Commit 74ef9527 authored by Yunxiang Li, committed by Christian König
Browse files

drm/amdgpu: track bo memory stats at runtime



Before, every time fdinfo is queried we try to lock all the BOs in the
VM and calculate memory usage from scratch. This works okay if the
fdinfo is rarely read and the VMs don't have a ton of BOs. If either of
these conditions is not true, we get a massive performance hit.

In this new revision, we track the BOs as they change states. This way
when the fdinfo is queried we only need to take the status lock and copy
out the usage stats with minimal impact to the runtime performance. With
this new approach however, we would no longer be able to track active
buffers.

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241219151411.1150-6-Yunxiang.Li@amd.com


Signed-off-by: Christian König <christian.koenig@amd.com>
parent a541a6e8
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@
#include "amdgpu_gem.h"
#include "amdgpu_dma_buf.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_vm.h"
#include <drm/amdgpu_drm.h>
#include <drm/ttm/ttm_tt.h>
#include <linux/dma-buf.h>
@@ -60,6 +61,8 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf,
	if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0)
		attach->peer2peer = false;

	amdgpu_vm_bo_update_shared(bo);

	return 0;
}

+6 −12
Original line number Diff line number Diff line
@@ -60,7 +60,7 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file)
	struct amdgpu_fpriv *fpriv = file->driver_priv;
	struct amdgpu_vm *vm = &fpriv->vm;

	struct amdgpu_mem_stats stats[__AMDGPU_PL_LAST + 1] = { };
	struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM];
	ktime_t usage[AMDGPU_HW_IP_NUM];
	const char *pl_name[] = {
		[TTM_PL_VRAM] = "vram",
@@ -72,15 +72,8 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file)
		[AMDGPU_PL_DOORBELL] = "doorbell",
	};
	unsigned int hw_ip, i;
	int ret;

	ret = amdgpu_bo_reserve(vm->root.bo, false);
	if (ret)
		return;

	amdgpu_vm_get_memory(vm, stats, ARRAY_SIZE(stats));
	amdgpu_bo_unreserve(vm->root.bo);

	amdgpu_vm_get_memory(vm, stats);
	amdgpu_ctx_mgr_usage(&fpriv->ctx_mgr, usage);

	/*
@@ -97,7 +90,6 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file)

		drm_print_memory_stats(p,
				       &stats[i].drm,
				       DRM_GEM_OBJECT_ACTIVE |
				       DRM_GEM_OBJECT_RESIDENT |
				       DRM_GEM_OBJECT_PURGEABLE,
				       pl_name[i]);
@@ -115,9 +107,11 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct drm_file *file)
	drm_printf(p, "amd-evicted-vram:\t%llu KiB\n",
		   stats[TTM_PL_VRAM].evicted/1024UL);
	drm_printf(p, "amd-requested-vram:\t%llu KiB\n",
		   stats[TTM_PL_VRAM].requested/1024UL);
		   (stats[TTM_PL_VRAM].drm.shared +
		    stats[TTM_PL_VRAM].drm.private) / 1024UL);
	drm_printf(p, "amd-requested-gtt:\t%llu KiB\n",
		   stats[TTM_PL_TT].requested/1024UL);
		   (stats[TTM_PL_TT].drm.shared +
		    stats[TTM_PL_TT].drm.private) / 1024UL);

	for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) {
		if (!usage[hw_ip])
+3 −0
Original line number Diff line number Diff line
@@ -42,6 +42,7 @@
#include "amdgpu_dma_buf.h"
#include "amdgpu_hmm.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_vm.h"

static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
{
@@ -179,6 +180,7 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
	if (r)
		return r;

	amdgpu_vm_bo_update_shared(abo);
	bo_va = amdgpu_vm_bo_find(vm, abo);
	if (!bo_va)
		bo_va = amdgpu_vm_bo_add(adev, vm, abo);
@@ -252,6 +254,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object *obj,
		goto out_unlock;

	amdgpu_vm_bo_del(adev, bo_va);
	amdgpu_vm_bo_update_shared(bo);
	if (!amdgpu_vm_ready(vm))
		goto out_unlock;

+40 −70
Original line number Diff line number Diff line
@@ -1157,7 +1157,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
		return;

	abo = ttm_to_amdgpu_bo(bo);
	amdgpu_vm_bo_invalidate(abo, evict);
	amdgpu_vm_bo_move(abo, new_mem, evict);

	amdgpu_bo_kunmap(abo);

@@ -1170,75 +1170,6 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,
			     old_mem ? old_mem->mem_type : -1);
}

void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
			  struct amdgpu_mem_stats *stats,
			  unsigned int sz)
{
	const unsigned int domain_to_pl[] = {
		[ilog2(AMDGPU_GEM_DOMAIN_CPU)]	    = TTM_PL_SYSTEM,
		[ilog2(AMDGPU_GEM_DOMAIN_GTT)]	    = TTM_PL_TT,
		[ilog2(AMDGPU_GEM_DOMAIN_VRAM)]	    = TTM_PL_VRAM,
		[ilog2(AMDGPU_GEM_DOMAIN_GDS)]	    = AMDGPU_PL_GDS,
		[ilog2(AMDGPU_GEM_DOMAIN_GWS)]	    = AMDGPU_PL_GWS,
		[ilog2(AMDGPU_GEM_DOMAIN_OA)]	    = AMDGPU_PL_OA,
		[ilog2(AMDGPU_GEM_DOMAIN_DOORBELL)] = AMDGPU_PL_DOORBELL,
	};
	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
	struct ttm_resource *res = bo->tbo.resource;
	struct drm_gem_object *obj = &bo->tbo.base;
	uint64_t size = amdgpu_bo_size(bo);
	unsigned int type;

	if (!res) {
		/*
		 * If no backing store use one of the preferred domain for basic
		 * stats. We take the MSB since that should give a reasonable
		 * view.
		 */
		BUILD_BUG_ON(TTM_PL_VRAM < TTM_PL_TT ||
			     TTM_PL_VRAM < TTM_PL_SYSTEM);
		type = fls(bo->preferred_domains & AMDGPU_GEM_DOMAIN_MASK);
		if (!type)
			return;
		type--;
		if (drm_WARN_ON_ONCE(&adev->ddev,
				     type >= ARRAY_SIZE(domain_to_pl)))
			return;
		type = domain_to_pl[type];
	} else {
		type = res->mem_type;
	}

	if (drm_WARN_ON_ONCE(&adev->ddev, type >= sz))
		return;

	/* DRM stats common fields: */

	if (drm_gem_object_is_shared_for_memory_stats(obj))
		stats[type].drm.shared += size;
	else
		stats[type].drm.private += size;

	if (res) {
		stats[type].drm.resident += size;

		if (!dma_resv_test_signaled(obj->resv, DMA_RESV_USAGE_BOOKKEEP))
			stats[type].drm.active += size;
		else if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE)
			stats[type].drm.purgeable += size;
	}

	/* amdgpu specific stats: */

	if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) {
		stats[TTM_PL_VRAM].requested += size;
		if (type != TTM_PL_VRAM)
			stats[TTM_PL_VRAM].evicted += size;
	} else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) {
		stats[TTM_PL_TT].requested += size;
	}
}

/**
 * amdgpu_bo_release_notify - notification about a BO being released
 * @bo: pointer to a buffer object
@@ -1453,6 +1384,45 @@ u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo)
	return amdgpu_gmc_sign_extend(offset);
}

/**
 * amdgpu_bo_mem_stats_placement - pick the one placement a BO is accounted to
 * @bo:	the buffer object being classified
 *
 * A BO may list several preferred domains. To keep the memory statistics from
 * counting it more than once, it is filed under exactly one placement: the
 * one matching the highest bit set in preferred_domains.
 *
 * Returns:
 * The placement the BO should be accounted under. TTM_PL_SYSTEM is returned
 * when no domain bit is set or the highest set bit is not recognised.
 */
uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo)
{
	uint32_t wanted = bo->preferred_domains & AMDGPU_GEM_DOMAIN_MASK;
	uint32_t top;

	if (wanted == 0)
		return TTM_PL_SYSTEM;

	/* Keep only the most significant domain bit. */
	top = rounddown_pow_of_two(wanted);

	if (top == AMDGPU_GEM_DOMAIN_GTT)
		return TTM_PL_TT;
	if (top == AMDGPU_GEM_DOMAIN_VRAM)
		return TTM_PL_VRAM;
	if (top == AMDGPU_GEM_DOMAIN_GDS)
		return AMDGPU_PL_GDS;
	if (top == AMDGPU_GEM_DOMAIN_GWS)
		return AMDGPU_PL_GWS;
	if (top == AMDGPU_GEM_DOMAIN_OA)
		return AMDGPU_PL_OA;
	if (top == AMDGPU_GEM_DOMAIN_DOORBELL)
		return AMDGPU_PL_DOORBELL;

	/* AMDGPU_GEM_DOMAIN_CPU and any unrecognised bit map to system memory. */
	return TTM_PL_SYSTEM;
}

/**
 * amdgpu_bo_get_preferred_domain - get preferred domain
 * @adev: amdgpu device object
+1 −3
Original line number Diff line number Diff line
@@ -300,9 +300,7 @@ int amdgpu_bo_sync_wait_resv(struct amdgpu_device *adev, struct dma_resv *resv,
int amdgpu_bo_sync_wait(struct amdgpu_bo *bo, void *owner, bool intr);
u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
			  struct amdgpu_mem_stats *stats,
			  unsigned int size);
uint32_t amdgpu_bo_mem_stats_placement(struct amdgpu_bo *bo);
uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
					    uint32_t domain);

Loading