Commit dcb49710 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'x86_cache_for_v7.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 resource control updates from Borislav Petkov:

 - Extend the resctrl machinery to support telemetry monitoring on
   Intel (Tony Luck)

   The practical usage of this is being able to tell how much energy or
   how much work can be attributed to a group of tasks tracked under a
   single identifier. Prepend this work with proper refactoring of
   resctrl domains handling code.

* tag 'x86_cache_for_v7.0_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (30 commits)
  x86,fs/resctrl: Update documentation for telemetry events
  x86/resctrl: Enable RDT_RESOURCE_PERF_PKG
  fs/resctrl: Move RMID initialization to first mount
  x86,fs/resctrl: Compute number of RMIDs as minimum across resources
  fs/resctrl: Move allocation/free of closid_num_dirty_rmid[]
  x86/resctrl: Handle number of RMIDs supported by RDT_RESOURCE_PERF_PKG
  x86/resctrl: Add energy/perf choices to rdt boot option
  x86,fs/resctrl: Handle domain creation/deletion for RDT_RESOURCE_PERF_PKG
  fs/resctrl: Refactor rmdir_mondata_subdir_allrdtgrp()
  fs/resctrl: Refactor mkdir_mondata_subdir()
  x86/resctrl: Read telemetry events
  x86/resctrl: Find and enable usable telemetry events
  x86,fs/resctrl: Add architectural event pointer
  x86,fs/resctrl: Fill in details of events for performance and energy GUIDs
  x86/resctrl: Discover hardware telemetry events
  fs/resctrl: Emphasize that L3 monitoring resource is required for summing domains
  x86,fs/resctrl: Add and initialize a resource for package scope monitoring
  x86,fs/resctrl: Add an architectural hook called for first mount
  x86,fs/resctrl: Support binary fixed point event counters
  x86,fs/resctrl: Handle events that can be read from any CPU
  ...
parents 75b2a603 a8848c4b
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -6354,9 +6354,14 @@ Kernel parameters
	rdt=		[HW,X86,RDT]
			Turn on/off individual RDT features. List is:
			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp,
			mba, smba, bmec, abmc, sdciae.
			mba, smba, bmec, abmc, sdciae, energy[:guid],
			perf[:guid].
			E.g. to turn on cmt and turn off mba use:
				rdt=cmt,!mba
			To turn off all energy telemetry monitoring and ensure that
			perf telemetry monitoring associated with guid 0x12345
			is enabled use:
				rdt=!energy,perf:0x12345

	reboot=		[KNL]
			Format (x86 or x86_64):
+54 −12
Original line number Diff line number Diff line
@@ -252,13 +252,12 @@ with respect to allocation:
			bandwidth percentages are directly applied to
			the threads running on the core

If RDT monitoring is available there will be an "L3_MON" directory
If L3 monitoring is available there will be an "L3_MON" directory
with the following files:

"num_rmids":
		The number of RMIDs available. This is the
		upper bound for how many "CTRL_MON" + "MON"
		groups can be created.
		The number of RMIDs supported by hardware for
		L3 monitoring events.

"mon_features":
		Lists the monitoring events if
@@ -484,6 +483,24 @@ with the following files:
		bytes) at which a previously used LLC_occupancy
		counter can be considered for reuse.

If telemetry monitoring is available there will be a "PERF_PKG_MON" directory
with the following files:

"num_rmids":
		The number of RMIDs for telemetry monitoring events.

		On Intel, resctrl will not enable telemetry events if the number of
		RMIDs that can be tracked concurrently is lower than the total number
		of RMIDs supported. Telemetry events can be force-enabled with the
		"rdt=" kernel parameter, but this may reduce the number of
		monitoring groups that can be created.

"mon_features":
		Lists the telemetry monitoring events that are enabled on this system.

The upper bound for how many "CTRL_MON" + "MON" groups can be created
is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values.

Finally, in the top level of the "info" directory there is a file
named "last_cmd_status". This is reset with every "command" issued
via the file system (making new directories or writing to any of the
@@ -589,15 +606,40 @@ When control is enabled all CTRL_MON groups will also contain:
When monitoring is enabled all MON groups will also contain:

"mon_data":
	This contains a set of files organized by L3 domain and by
	RDT event. E.g. on a system with two L3 domains there will
	be subdirectories "mon_L3_00" and "mon_L3_01".	Each of these
	directories have one file per event (e.g. "llc_occupancy",
	"mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
	files provide a read out of the current value of the event for
	all tasks in the group. In CTRL_MON groups these files provide
	the sum for all tasks in the CTRL_MON group and all tasks in
	This contains directories for each monitor domain.

	If L3 monitoring is enabled, there will be a "mon_L3_XX" directory for
	each instance of an L3 cache. Each directory contains files for the enabled
	L3 events (e.g. "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes").

	If telemetry monitoring is enabled, there will be a "mon_PERF_PKG_YY"
	directory for each physical processor package. Each directory contains
	files for the enabled telemetry events (e.g. "core_energy", "activity",
	"uops_retired", etc.).

	The info/`*`/mon_features files provide the full list of enabled
	event/file names.

	"core_energy" reports a floating point number for the energy (in Joules)
	consumed by cores (registers, arithmetic units, TLB and L1/L2 caches)
	during execution of instructions summed across all logical CPUs on a
	package for the current monitoring group.

	"activity" also reports a floating point value (in Farads).  This provides
	an estimate of work done independent of the frequency that the CPUs used
	for execution.

	Note that "core_energy" and "activity" only measure energy/activity in the
	"core" of the CPU (arithmetic units, TLB, L1 and L2 caches, etc.). They
	do not include L3 cache, memory, I/O devices etc.

	All other events report decimal integer values.

	In a MON group these files provide a read out of the current value of
	the event for all tasks in the group. In CTRL_MON groups these files
	provide the sum for all tasks in the CTRL_MON group and all tasks in
	MON groups. Please see example section for more details on usage.

	On systems with Sub-NUMA Cluster (SNC) enabled there are extra
	directories for each node (located within the "mon_L3_XX" directory
	for the L3 cache they occupy). These are named "mon_sub_L3_YY"
+13 −0
Original line number Diff line number Diff line
@@ -541,6 +541,19 @@ config X86_CPU_RESCTRL

	  Say N if unsure.

config X86_CPU_RESCTRL_INTEL_AET
	bool "Intel Application Energy Telemetry"
	depends on X86_64 && X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y
	help
	  Enable per-RMID telemetry events in resctrl.

	  Intel feature that collects per-RMID execution data
	  about energy consumption, a frequency-independent measure
	  of activity, and other performance metrics. Data is
	  aggregated per package.

	  Say N if unsure.

config X86_FRED
	bool "Flexible Return and Event Delivery"
	depends on X86_64
+1 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_X86_CPU_RESCTRL)		+= core.o rdtgroup.o monitor.o
obj-$(CONFIG_X86_CPU_RESCTRL)		+= ctrlmondata.o
obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET)	+= intel_aet.o
obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)	+= pseudo_lock.o

# To allow define_trace.h's recursive include:
+151 −73
Original line number Diff line number Diff line
@@ -100,14 +100,33 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
			.schema_fmt		= RESCTRL_SCHEMA_RANGE,
		},
	},
	[RDT_RESOURCE_PERF_PKG] =
	{
		.r_resctrl = {
			.name			= "PERF_PKG",
			.mon_scope		= RESCTRL_PACKAGE,
			.mon_domains		= mon_domain_init(RDT_RESOURCE_PERF_PKG),
		},
	},
};

/**
 * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs
 *				      (minimum across all mon_capable resources)
 *
 * Return: Number of supported RMIDs at time of call. Note that mount time
 * enumeration of resources may reduce the number.
 */
u32 resctrl_arch_system_num_rmid_idx(void)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
	u32 num_rmids = U32_MAX;
	struct rdt_resource *r;

	for_each_mon_capable_rdt_resource(r)
		num_rmids = min(num_rmids, r->mon.num_rmid);

	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
	return r->mon.num_rmid;
	return num_rmids == U32_MAX ? 0 : num_rmids;
}

struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
@@ -368,7 +387,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
	kfree(hw_dom);
}

static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom)
{
	int idx;

@@ -401,11 +420,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *
}

/**
 * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
 * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
 * @num_rmid:	The size of the MBM counter array
 * @hw_dom:	The domain that owns the allocated arrays
 *
 * Return:	0 for success, or -ENOMEM.
 */
static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom)
{
	size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
	enum resctrl_event_id eventid;
@@ -438,6 +459,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
		return get_cpu_cacheinfo_id(cpu, scope);
	case RESCTRL_L3_NODE:
		return cpu_to_node(cpu);
	case RESCTRL_PACKAGE:
		return topology_physical_package_id(cpu);
	default:
		break;
	}
@@ -464,7 +487,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)

	hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
	if (hdr) {
		if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
		if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
			return;
		d = container_of(hdr, struct rdt_ctrl_domain, hdr);

@@ -481,6 +504,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
	d = &hw_dom->d_resctrl;
	d->hdr.id = id;
	d->hdr.type = RESCTRL_CTRL_DOMAIN;
	d->hdr.rid = r->rid;
	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);

	rdt_domain_reconfigure_cdp(r);
@@ -500,37 +524,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
	}
}

static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos)
{
	int id = get_domain_id_from_scope(cpu, r->mon_scope);
	struct list_head *add_pos = NULL;
	struct rdt_hw_mon_domain *hw_dom;
	struct rdt_domain_hdr *hdr;
	struct rdt_mon_domain *d;
	struct rdt_hw_l3_mon_domain *hw_dom;
	struct rdt_l3_mon_domain *d;
	struct cacheinfo *ci;
	int err;

	lockdep_assert_held(&domain_list_lock);

	if (id < 0) {
		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->mon_scope, r->name);
		return;
	}

	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
	if (hdr) {
		if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
			return;
		d = container_of(hdr, struct rdt_mon_domain, hdr);

		cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
		/* Update the mbm_assign_mode state for the CPU if supported */
		if (r->mon.mbm_cntr_assignable)
			resctrl_arch_mbm_cntr_assign_set_one(r);
		return;
	}

	hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
	if (!hw_dom)
		return;
@@ -538,33 +538,66 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
	d = &hw_dom->d_resctrl;
	d->hdr.id = id;
	d->hdr.type = RESCTRL_MON_DOMAIN;
	d->hdr.rid = RDT_RESOURCE_L3;
	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
	if (!ci) {
		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
		mon_domain_free(hw_dom);
		l3_mon_domain_free(hw_dom);
		return;
	}
	d->ci_id = ci->id;
	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);

	/* Update the mbm_assign_mode state for the CPU if supported */
	if (r->mon.mbm_cntr_assignable)
		resctrl_arch_mbm_cntr_assign_set_one(r);

	arch_mon_domain_online(r, d);

	if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
		mon_domain_free(hw_dom);
	if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
		l3_mon_domain_free(hw_dom);
		return;
	}

	list_add_tail_rcu(&d->hdr.list, add_pos);

	err = resctrl_online_mon_domain(r, d);
	err = resctrl_online_mon_domain(r, &d->hdr);
	if (err) {
		list_del_rcu(&d->hdr.list);
		synchronize_rcu();
		mon_domain_free(hw_dom);
		l3_mon_domain_free(hw_dom);
	}
}

/*
 * Add @cpu to the monitor domain of resource @r that covers it, setting up
 * a new domain when none with the matching id exists yet.
 *
 * The domain id is derived from the resource's monitoring scope. If a domain
 * with that id is already on r->mon_domains the CPU is only recorded in its
 * cpu_mask; otherwise the per-resource setup helper is called with the list
 * position reported by the lookup.
 *
 * Must be called with domain_list_lock held.
 */
static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->mon_scope);
	struct list_head *add_pos = NULL;
	struct rdt_domain_hdr *hdr;

	lockdep_assert_held(&domain_list_lock);

	/* No usable domain id for this CPU/scope: warn once and give up */
	if (id < 0) {
		pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
			     cpu, r->mon_scope, r->name);
		return;
	}

	hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
	/* Domain already exists: just mark this CPU as a member */
	if (hdr)
		cpumask_set_cpu(cpu, &hdr->cpu_mask);

	/* Resource-specific work; new domains are only built when !hdr */
	switch (r->rid) {
	case RDT_RESOURCE_L3:
		/*
		 * Update the mbm_assign_mode state for the CPU if supported.
		 * This runs for every added CPU, whether or not the domain
		 * already existed.
		 */
		if (r->mon.mbm_cntr_assignable)
			resctrl_arch_mbm_cntr_assign_set_one(r);
		if (!hdr)
			l3_mon_domain_setup(cpu, id, r, add_pos);
		break;
	case RDT_RESOURCE_PERF_PKG:
		/*
		 * NOTE(review): intel_aet_mon_domain_setup() is defined
		 * elsewhere; presumably it creates the package scope domain
		 * and inserts it at add_pos - confirm.
		 */
		if (!hdr)
			intel_aet_mon_domain_setup(cpu, id, r, add_pos);
		break;
	default:
		pr_warn_once("Unknown resource rid=%d\n", r->rid);
		break;
	}
}

@@ -598,16 +631,18 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
		return;
	}

	if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
	if (!cpumask_empty(&hdr->cpu_mask))
		return;

	if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid))
		return;

	d = container_of(hdr, struct rdt_ctrl_domain, hdr);
	hw_dom = resctrl_to_arch_ctrl_dom(d);

	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
	if (cpumask_empty(&d->hdr.cpu_mask)) {
	resctrl_offline_ctrl_domain(r, d);
		list_del_rcu(&d->hdr.list);
	list_del_rcu(&hdr->list);
	synchronize_rcu();

	/*
@@ -617,17 +652,12 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
	if (d->plr)
		d->plr->d = NULL;
	ctrl_domain_free(hw_dom);

		return;
	}
}

static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
{
	int id = get_domain_id_from_scope(cpu, r->mon_scope);
	struct rdt_hw_mon_domain *hw_dom;
	struct rdt_domain_hdr *hdr;
	struct rdt_mon_domain *d;

	lockdep_assert_held(&domain_list_lock);

@@ -644,20 +674,42 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
		return;
	}

	if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
	cpumask_clear_cpu(cpu, &hdr->cpu_mask);
	if (!cpumask_empty(&hdr->cpu_mask))
		return;

	d = container_of(hdr, struct rdt_mon_domain, hdr);
	hw_dom = resctrl_to_arch_mon_dom(d);
	switch (r->rid) {
	case RDT_RESOURCE_L3: {
		struct rdt_hw_l3_mon_domain *hw_dom;
		struct rdt_l3_mon_domain *d;

	cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
	if (cpumask_empty(&d->hdr.cpu_mask)) {
		resctrl_offline_mon_domain(r, d);
		list_del_rcu(&d->hdr.list);
		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3))
			return;

		d = container_of(hdr, struct rdt_l3_mon_domain, hdr);
		hw_dom = resctrl_to_arch_mon_dom(d);
		resctrl_offline_mon_domain(r, hdr);
		list_del_rcu(&hdr->list);
		synchronize_rcu();
		mon_domain_free(hw_dom);
		l3_mon_domain_free(hw_dom);
		break;
	}
	case RDT_RESOURCE_PERF_PKG: {
		struct rdt_perf_pkg_mon_domain *pkgd;

		if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG))
			return;

		pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr);
		resctrl_offline_mon_domain(r, hdr);
		list_del_rcu(&hdr->list);
		synchronize_rcu();
		kfree(pkgd);
		break;
	}
	default:
		pr_warn_once("Unknown resource rid=%d\n", r->rid);
		break;
	}
}

@@ -712,6 +764,28 @@ static int resctrl_arch_offline_cpu(unsigned int cpu)
	return 0;
}

/*
 * resctrl_arch_pre_mount() - Enable RDT_RESOURCE_PERF_PKG monitoring if
 *			      telemetry events are found.
 *
 * Does nothing when intel_aet_get_events() reports no events. Otherwise the
 * PERF_PKG resource and the global rdt_mon_capable flag are marked monitor
 * capable and a monitor domain entry is added for every online CPU.
 *
 * NOTE(review): going by the name this is invoked before the resctrl file
 * system is mounted; the caller is not visible here - confirm.
 */
void resctrl_arch_pre_mount(void)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
	int cpu;

	/* No telemetry events available: leave the resource disabled */
	if (!intel_aet_get_events())
		return;

	/*
	 * Late discovery of telemetry events means the domains for the
	 * resource were not built. Do that now.
	 */
	/*
	 * CPU hotplug read lock is taken around the walk of online CPUs;
	 * domain_add_cpu_mon() asserts that domain_list_lock is held.
	 */
	cpus_read_lock();
	mutex_lock(&domain_list_lock);
	/* Both flags are set under the locks, before any domain is added */
	r->mon_capable = true;
	rdt_mon_capable = true;
	for_each_online_cpu(cpu)
		domain_add_cpu_mon(cpu, r);
	mutex_unlock(&domain_list_lock);
	cpus_read_unlock();
}

enum {
	RDT_FLAG_CMT,
	RDT_FLAG_MBM_TOTAL,
@@ -767,6 +841,8 @@ static int __init set_rdt_options(char *str)
		force_off = *tok == '!';
		if (force_off)
			tok++;
		if (intel_handle_aet_option(force_off, tok))
			continue;
		for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
			if (strcmp(tok, o->name) == 0) {
				if (force_off)
@@ -889,15 +965,15 @@ static __init bool get_rdt_mon_resources(void)
	bool ret = false;

	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL);
		ret = true;
	}
	if (rdt_cpu_has(X86_FEATURE_ABMC))
@@ -906,7 +982,7 @@ static __init bool get_rdt_mon_resources(void)
	if (!ret)
		return false;

	return !rdt_get_mon_l3_config(r);
	return !rdt_get_l3_mon_config(r);
}

static __init void __check_quirks_intel(void)
@@ -1084,6 +1160,8 @@ late_initcall(resctrl_arch_late_init);

static void __exit resctrl_arch_exit(void)
{
	intel_aet_exit();

	cpuhp_remove_state(rdt_online);

	resctrl_exit();
Loading