Commit 9ff885ef authored by Matthew Brost
Browse files

drm/xe: Convert GT stats to per-cpu counters



Current GT statistics use atomic64_t counters. Atomic operations incur
a global coherency penalty.

Transition to dynamic per-cpu counters using alloc_percpu(). This allows
stats to be incremented via this_cpu_add(), which compiles to a single
non-locking instruction. This approach keeps the hot-path updates local
to the CPU, avoiding expensive cross-core cache invalidation traffic.

Use for_each_possible_cpu() during aggregation and clear operations to
ensure data consistency across CPU hotplug events.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
Link: https://patch.msgid.link/20260217200552.596718-1-matthew.brost@intel.com
parent 48eb073c
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@
#include "xe_gt_printk.h"
#include "xe_gt_sriov_pf.h"
#include "xe_gt_sriov_vf.h"
#include "xe_gt_stats.h"
#include "xe_gt_sysfs.h"
#include "xe_gt_topology.h"
#include "xe_guc_exec_queue_types.h"
@@ -455,6 +456,10 @@ int xe_gt_init_early(struct xe_gt *gt)
	if (err)
		return err;

	err = xe_gt_stats_init(gt);
	if (err)
		return err;

	CLASS(xe_force_wake, fw_ref)(gt_to_fw(gt), XE_FW_GT);
	if (!fw_ref.domains)
		return -ETIMEDOUT;
+51 −12
Original line number Diff line number Diff line
@@ -3,12 +3,37 @@
 * Copyright © 2024 Intel Corporation
 */

#include <linux/atomic.h>

#include <drm/drm_managed.h>
#include <drm/drm_print.h>

#include "xe_device.h"
#include "xe_gt_stats.h"
#include "xe_gt_types.h"

static void xe_gt_stats_fini(struct drm_device *drm, void *arg)
{
	struct xe_gt *gt = arg;

	free_percpu(gt->stats);
}

/**
 * xe_gt_stats_init() - Set up the per-CPU statistics storage of a GT
 * @gt: GT structure
 *
 * Allocates the per-CPU struct xe_gt_stats backing @gt->stats, so counters
 * can be incremented without cross-CPU atomics, and registers a DRM managed
 * action that frees the allocation again.
 *
 * Return: 0 on success, -ENOMEM if the per-CPU allocation fails, otherwise
 * the error code returned by drmm_add_action_or_reset().
 */
int xe_gt_stats_init(struct xe_gt *gt)
{
	struct drm_device *drm = &gt_to_xe(gt)->drm;
	int err;

	gt->stats = alloc_percpu(struct xe_gt_stats);
	if (!gt->stats)
		return -ENOMEM;

	/* The _or_reset variant invokes xe_gt_stats_fini() itself on failure. */
	err = drmm_add_action_or_reset(drm, xe_gt_stats_fini, gt);

	return err;
}

/**
 * xe_gt_stats_incr - Increments the specified stats counter
@@ -23,7 +48,7 @@ void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr)
	if (id >= __XE_GT_STATS_NUM_IDS)
		return;

	atomic64_add(incr, &gt->stats.counters[id]);
	this_cpu_add(gt->stats->counters[id], incr);
}

#define DEF_STAT_STR(ID, name) [XE_GT_STATS_ID_##ID] = name
@@ -94,23 +119,37 @@ int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p)
{
	enum xe_gt_stats_id id;

	for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id)
		drm_printf(p, "%s: %lld\n", stat_description[id],
			   atomic64_read(&gt->stats.counters[id]));
	/*
	 * Each counter lives in per-CPU storage, so the value reported for a
	 * stat is the sum of that counter over every possible CPU.
	 *
	 * NOTE(review): total is a u64 but is printed with "%lld"; confirm
	 * whether "%llu" is intended now that the counters are no longer
	 * atomic64_t.
	 */
	for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id) {
		u64 total = 0;
		int cpu;

		for_each_possible_cpu(cpu) {
			struct xe_gt_stats *s = per_cpu_ptr(gt->stats, cpu);

			total += s->counters[id];
		}

		drm_printf(p, "%s: %lld\n", stat_description[id], total);
	}

	return 0;
}

/**
 * xe_gt_stats_clear - Clear the GT stats
 * xe_gt_stats_clear() - Clear the GT stats
 * @gt: GT structure
 *
 * This clear (zeros) all the available GT stats.
 * Clear (zero) all available GT stats. Note that if the stats are being
 * updated while this function is running, the results may be unpredictable.
 * Intended to be called on an idle GPU.
 */
void xe_gt_stats_clear(struct xe_gt *gt)
{
	int id;
	int cpu;

	for (id = 0; id < ARRAY_SIZE(gt->stats.counters); ++id)
		atomic64_set(&gt->stats.counters[id], 0);
	for_each_possible_cpu(cpu) {
		struct xe_gt_stats *s = per_cpu_ptr(gt->stats, cpu);

		memset(s, 0, sizeof(*s));
	}
}
+6 −0
Original line number Diff line number Diff line
@@ -14,10 +14,16 @@ struct xe_gt;
struct drm_printer;

#ifdef CONFIG_DEBUG_FS
int xe_gt_stats_init(struct xe_gt *gt);
int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p);
void xe_gt_stats_clear(struct xe_gt *gt);
void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr);
#else
/*
 * Stub used when CONFIG_DEBUG_FS is not defined: there is no stats storage
 * to set up, so this always reports success.
 */
static inline int xe_gt_stats_init(struct xe_gt *gt)
{
	return 0;
}

static inline void
xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id,
		 int incr)
+19 −0
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@
#ifndef _XE_GT_STATS_TYPES_H_
#define _XE_GT_STATS_TYPES_H_

#include <linux/types.h>

enum xe_gt_stats_id {
	XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT,
	XE_GT_STATS_ID_TLB_INVAL,
@@ -58,4 +60,21 @@ enum xe_gt_stats_id {
	__XE_GT_STATS_NUM_IDS,
};

/**
 * struct xe_gt_stats - Per-CPU GT statistics counters
 * @counters: Array of 64-bit counters indexed by &enum xe_gt_stats_id
 *
 * This structure is used for high-frequency, per-CPU statistics collection
 * in the Xe driver. One instance exists per CPU; by using a per-CPU
 * allocation and keeping the structure cache-line aligned, updates avoid
 * expensive atomics and cache coherency traffic.
 *
 * Updates to these counters should be performed using the this_cpu_add()
 * macro to ensure they are atomic with respect to local interrupts and
 * preemption-safe without the overhead of explicit locking. The reported
 * value of a stat is the sum of its counter across all CPUs.
 *
 * NOTE(review): ____cacheline_aligned is provided by <linux/cache.h>, which
 * this header does not include directly - confirm it is always pulled in
 * by the includer.
 */
struct xe_gt_stats {
	u64 counters[__XE_GT_STATS_NUM_IDS];
} ____cacheline_aligned;

#endif
+1 −4
Original line number Diff line number Diff line
@@ -158,10 +158,7 @@ struct xe_gt {

#if IS_ENABLED(CONFIG_DEBUG_FS)
	/** @stats: GT stats */
	struct {
		/** @stats.counters: counters for various GT stats */
		atomic64_t counters[__XE_GT_STATS_NUM_IDS];
	} stats;
	struct xe_gt_stats __percpu *stats;
#endif

	/**