Commit ad9843aa authored by Arvind Yadav, committed by Matthew Brost
Browse files

drm/xe/madvise: Implement purgeable buffer object support



Purgeable buffer objects allow userspace applications to provide memory
usage hints to the kernel for better memory management under pressure.

Add the core implementation for purgeable buffer objects, enabling
memory reclamation of user-designated DONTNEED buffers during eviction.

This patch implements the purge operation and state machine transitions:

Purgeable States (from xe_madv_purgeable_state):
 - WILLNEED (0): BO should be retained, actively used
 - DONTNEED (1): BO eligible for purging, not currently needed
 - PURGED (2): BO backing store reclaimed, permanently invalid

Design Rationale:
  - Async TLB invalidation via trigger_rebind (no blocking
    xe_vm_invalidate_vma)
  - i915 compatibility: retained field, "once purged always purged"
    semantics
  - Shared BO protection prevents multi-process memory corruption
  - Scratch PTE reuse avoids new infrastructure, safe for fault mode

Note: The madvise_purgeable() function is implemented but not hooked
into the IOCTL handler (madvise_funcs[] entry is NULL) to maintain
bisectability. The feature will be enabled in the final patch when all
supporting infrastructure (shrinker, per-VMA tracking) is complete.

Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Arvind Yadav <arvind.yadav@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patch.msgid.link/20260326130843.3545241-4-arvind.yadav@intel.com
parent b67427f9
Loading
Loading
Loading
Loading
+95 −12
Original line number Diff line number Diff line
@@ -838,6 +838,84 @@ static int xe_bo_move_notify(struct xe_bo *bo,
	return 0;
}

/**
 * xe_bo_set_purgeable_state() - Set BO purgeable state with validation
 * @bo: Buffer object
 * @new_state: New purgeable state
 *
 * Sets the purgeable state with lockdep assertions and validates state
 * transitions. Once a BO is PURGED, it cannot transition to any other state.
 * Invalid transitions are caught with xe_assert().
 *
 * The function has no return value and the final store to
 * @bo->madv_purgeable is not guarded by any conditional, so callers are
 * responsible for requesting only valid transitions; the assertions exist
 * to flag violations, not to reject them.
 * NOTE(review): xe_assert() is presumably a debug-build check - confirm
 * whether an invalid transition is silently applied in production builds.
 */
void xe_bo_set_purgeable_state(struct xe_bo *bo,
			       enum xe_madv_purgeable_state new_state)
{
	struct xe_device *xe = xe_bo_device(bo);

	/* The purgeable state may only be changed with the BO held. */
	xe_bo_assert_held(bo);

	/* Validate state is one of the known values */
	xe_assert(xe, new_state == XE_MADV_PURGEABLE_WILLNEED ||
		  new_state == XE_MADV_PURGEABLE_DONTNEED ||
		  new_state == XE_MADV_PURGEABLE_PURGED);

	/*
	 * Once purged, always purged - cannot transition out.
	 * PURGED -> PURGED is the only request accepted for a purged BO.
	 */
	xe_assert(xe, !(bo->madv_purgeable == XE_MADV_PURGEABLE_PURGED &&
			new_state != XE_MADV_PURGEABLE_PURGED));

	bo->madv_purgeable = new_state;
}

/**
 * xe_ttm_bo_purge() - Purge buffer object backing store
 * @ttm_bo: The TTM buffer object to purge
 * @ctx: TTM operation context
 *
 * This function purges the backing store of a BO marked as DONTNEED and
 * triggers rebind to invalidate stale GPU mappings. For fault-mode VMs,
 * this zaps the PTEs. The next GPU access will trigger a page fault and
 * perform NULL rebind (scratch pages or clear PTEs based on VM config).
 *
 * Return: 0 on success, negative error code on failure
 */
static int xe_ttm_bo_purge(struct ttm_buffer_object *ttm_bo, struct ttm_operation_ctx *ctx)
{
	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);
	struct ttm_placement place = {};
	int ret;

	xe_bo_assert_held(bo);

	if (!ttm_bo->ttm)
		return 0;

	if (!xe_bo_madv_is_dontneed(bo))
		return 0;

	/*
	 * Use the standard pre-move hook so we share the same cleanup/invalidate
	 * path as migrations: drop any CPU vmap and schedule the necessary GPU
	 * unbind/rebind work.
	 *
	 * This must be called before ttm_bo_validate() frees the pages.
	 * May fail in no-wait contexts (fault/shrinker) or if the BO is
	 * pinned. Keep state unchanged on failure so we don't end up "PURGED"
	 * with stale mappings.
	 */
	ret = xe_bo_move_notify(bo, ctx);
	if (ret)
		return ret;

	ret = ttm_bo_validate(ttm_bo, &place, ctx);
	if (ret)
		return ret;

	/* Commit the state transition only once invalidation was queued */
	xe_bo_set_purgeable_state(bo, XE_MADV_PURGEABLE_PURGED);

	return 0;
}

static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
		      struct ttm_operation_ctx *ctx,
		      struct ttm_resource *new_mem,
@@ -857,6 +935,20 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
				  ttm && ttm_tt_is_populated(ttm)) ? true : false;
	int ret = 0;

	/*
	 * Purge only non-shared BOs explicitly marked DONTNEED by userspace.
	 * The move_notify callback will handle invalidation asynchronously.
	 */
	if (evict && xe_bo_madv_is_dontneed(bo)) {
		ret = xe_ttm_bo_purge(ttm_bo, ctx);
		if (ret)
			return ret;

		/* Free the unused eviction destination resource */
		ttm_resource_free(ttm_bo, &new_mem);
		return 0;
	}

	/* Bo creation path, moving to system or TT. */
	if ((!old_mem && ttm) && !handle_system_ccs) {
		if (new_mem->mem_type == XE_PL_TT)
@@ -1606,18 +1698,6 @@ static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
	}
}

static void xe_ttm_bo_purge(struct ttm_buffer_object *ttm_bo, struct ttm_operation_ctx *ctx)
{
	struct xe_device *xe = ttm_to_xe_device(ttm_bo->bdev);

	if (ttm_bo->ttm) {
		struct ttm_placement place = {};
		int ret = ttm_bo_validate(ttm_bo, &place, ctx);

		drm_WARN_ON(&xe->drm, ret);
	}
}

static void xe_ttm_bo_swap_notify(struct ttm_buffer_object *ttm_bo)
{
	struct ttm_operation_ctx ctx = {
@@ -2198,6 +2278,9 @@ struct xe_bo *xe_bo_init_locked(struct xe_device *xe, struct xe_bo *bo,
#endif
	INIT_LIST_HEAD(&bo->vram_userfault_link);

	/* Initialize purge advisory state */
	bo->madv_purgeable = XE_MADV_PURGEABLE_WILLNEED;

	drm_gem_private_object_init(&xe->drm, &bo->ttm.base, size);

	if (resv) {
+2 −0
Original line number Diff line number Diff line
@@ -271,6 +271,8 @@ static inline bool xe_bo_madv_is_dontneed(struct xe_bo *bo)
	return bo->madv_purgeable == XE_MADV_PURGEABLE_DONTNEED;
}

void xe_bo_set_purgeable_state(struct xe_bo *bo, enum xe_madv_purgeable_state new_state);

static inline void xe_bo_unpin_map_no_vm(struct xe_bo *bo)
{
	if (likely(bo)) {
+14 −1
Original line number Diff line number Diff line
@@ -59,6 +59,19 @@ static int xe_pagefault_begin(struct drm_exec *exec, struct xe_vma *vma,
	if (!bo)
		return 0;

	/*
	 * Skip validate/migrate for DONTNEED/purged BOs - repopulating
	 * their pages would prevent the shrinker from reclaiming them.
	 * For non-scratch VMs there is no safe fallback so fail the fault.
	 * For scratch VMs let xe_vma_rebind() run normally; it will install
	 * scratch PTEs so the GPU gets safe zero reads instead of faulting.
	 */
	if (unlikely(xe_bo_madv_is_dontneed(bo) || xe_bo_is_purged(bo))) {
		if (!xe_vm_has_scratch(vm))
			return -EACCES;
		return 0;
	}

	return need_vram_move ? xe_bo_migrate(bo, vram->placement, NULL, exec) :
		xe_bo_validate(bo, vm, true, exec);
}
@@ -145,7 +158,7 @@ static struct xe_vm *xe_pagefault_asid_to_vm(struct xe_device *xe, u32 asid)

	down_read(&xe->usm.lock);
	vm = xa_load(&xe->usm.asid_to_vm, asid);
	if (vm && xe_vm_in_fault_mode(vm))
	if (vm && (xe_vm_in_fault_mode(vm) || xe_vm_has_scratch(vm)))
		xe_vm_get(vm);
	else
		vm = ERR_PTR(-EINVAL);
+33 −7
Original line number Diff line number Diff line
@@ -531,20 +531,26 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
	/* Is this a leaf entry ?*/
	if (level == 0 || xe_pt_hugepte_possible(addr, next, level, xe_walk)) {
		struct xe_res_cursor *curs = xe_walk->curs;
		bool is_null = xe_vma_is_null(xe_walk->vma);
		bool is_vram = is_null ? false : xe_res_is_vram(curs);
		struct xe_bo *bo = xe_vma_bo(xe_walk->vma);
		bool is_null_or_purged = xe_vma_is_null(xe_walk->vma) ||
					 (bo && xe_bo_is_purged(bo));
		bool is_vram = is_null_or_purged ? false : xe_res_is_vram(curs);

		XE_WARN_ON(xe_walk->va_curs_start != addr);

		if (xe_walk->clear_pt) {
			pte = 0;
		} else {
			pte = vm->pt_ops->pte_encode_vma(is_null ? 0 :
			/*
			 * For purged BOs, treat like null VMAs - pass address 0.
			 * The pte_encode_vma will set XE_PTE_NULL flag for scratch mapping.
			 */
			pte = vm->pt_ops->pte_encode_vma(is_null_or_purged ? 0 :
							 xe_res_dma(curs) +
							 xe_walk->dma_offset,
							 xe_walk->vma,
							 pat_index, level);
			if (!is_null)
			if (!is_null_or_purged)
				pte |= is_vram ? xe_walk->default_vram_pte :
					xe_walk->default_system_pte;

@@ -568,7 +574,7 @@ xe_pt_stage_bind_entry(struct xe_ptw *parent, pgoff_t offset,
		if (unlikely(ret))
			return ret;

		if (!is_null && !xe_walk->clear_pt)
		if (!is_null_or_purged && !xe_walk->clear_pt)
			xe_res_next(curs, next - addr);
		xe_walk->va_curs_start = next;
		xe_walk->vma->gpuva.flags |= (XE_VMA_PTE_4K << level);
@@ -721,6 +727,26 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
	};
	struct xe_pt *pt = vm->pt_root[tile->id];
	int ret;
	bool is_purged = false;

	/*
	 * Check if BO is purged:
	 * - Scratch VMs: Use scratch PTEs (XE_PTE_NULL) for safe zero reads
	 * - Non-scratch VMs: Clear PTEs to zero (non-present) to avoid mapping to phys addr 0
	 *
	 * For non-scratch VMs, we force clear_pt=true so leaf PTEs become completely
	 * zero instead of creating a PRESENT mapping to physical address 0.
	 */
	if (bo && xe_bo_is_purged(bo)) {
		is_purged = true;

		/*
		 * For non-scratch VMs, a NULL rebind should use zero PTEs
		 * (non-present), not a present PTE to phys 0.
		 */
		if (!xe_vm_has_scratch(vm))
			xe_walk.clear_pt = true;
	}

	if (range) {
		/* Move this entire thing to xe_svm.c? */
@@ -756,11 +782,11 @@ xe_pt_stage_bind(struct xe_tile *tile, struct xe_vma *vma,
	}

	xe_walk.default_vram_pte |= XE_PPGTT_PTE_DM;
	xe_walk.dma_offset = bo ? vram_region_gpu_offset(bo->ttm.resource) : 0;
	xe_walk.dma_offset = (bo && !is_purged) ? vram_region_gpu_offset(bo->ttm.resource) : 0;
	if (!range)
		xe_bo_assert_held(bo);

	if (!xe_vma_is_null(vma) && !range) {
	if (!xe_vma_is_null(vma) && !range && !is_purged) {
		if (xe_vma_is_userptr(vma))
			xe_res_first_dma(to_userptr_vma(vma)->userptr.pages.dma_addr, 0,
					 xe_vma_size(vma), &curs);
+18 −2
Original line number Diff line number Diff line
@@ -327,6 +327,7 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
{
	struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
	struct xe_bo *bo = gem_to_xe_bo(vm_bo->obj);
	struct drm_gpuva *gpuva;
	int ret;

@@ -335,10 +336,16 @@ static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
		list_move_tail(&gpuva_to_vma(gpuva)->combined_links.rebind,
			       &vm->rebind_list);

	/* Skip re-populating purged BOs, rebind maps scratch pages. */
	if (xe_bo_is_purged(bo)) {
		vm_bo->evicted = false;
		return 0;
	}

	if (!try_wait_for_completion(&vm->xe->pm_block))
		return -EAGAIN;

	ret = xe_bo_validate(gem_to_xe_bo(vm_bo->obj), vm, false, exec);
	ret = xe_bo_validate(bo, vm, false, exec);
	if (ret)
		return ret;

@@ -1427,6 +1434,9 @@ static u64 xelp_pte_encode_bo(struct xe_bo *bo, u64 bo_offset,
static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
			       u16 pat_index, u32 pt_level)
{
	struct xe_bo *bo = xe_vma_bo(vma);
	struct xe_vm *vm = xe_vma_vm(vma);

	pte |= XE_PAGE_PRESENT;

	if (likely(!xe_vma_read_only(vma)))
@@ -1435,7 +1445,13 @@ static u64 xelp_pte_encode_vma(u64 pte, struct xe_vma *vma,
	pte |= pte_encode_pat_index(pat_index, pt_level);
	pte |= pte_encode_ps(pt_level);

	if (unlikely(xe_vma_is_null(vma)))
	/*
	 * NULL PTEs redirect to scratch page (return zeros on read).
	 * Set for: 1) explicit null VMAs, 2) purged BOs on scratch VMs.
	 * Never set NULL flag without scratch page - causes undefined behavior.
	 */
	if (unlikely(xe_vma_is_null(vma) ||
		     (bo && xe_bo_is_purged(bo) && xe_vm_has_scratch(vm))))
		pte |= XE_PTE_NULL;

	return pte;
Loading