Commit d4843615 authored by Ian Rogers's avatar Ian Rogers Committed by Arnaldo Carvalho de Melo
Browse files

perf evlist: Reduce affinity use and move into iterator, fix no affinity



The evlist__for_each_cpu iterator will call sched_setaffinity when
moving between CPUs to avoid IPIs.

If only 1 IPI is saved then this may be unprofitable as the delay to get
scheduled may be considerable.

This may be particularly true if reading an event group in `perf stat`
in interval mode.

Move the affinity handling completely into the iterator so that a single
evlist__use_affinity can determine whether CPU affinities will be used.

For `perf record` the change is minimal as the dummy event and the real
event will always make the use of affinities the thing to do.

In `perf stat`, tool events are ignored and affinities are only used if >1
event occurs on the same CPU.

Determining if affinities are useful is done by evlist__use_affinity
which tests per-event whether the event's PMU benefits from affinity use
- it is assumed that only PMUs using perf events do.

Fix a bug where, when there are no affinities, the CPU map iterator
may reference a CPU not present in the initial evsel. Fix by making the
iterator and non-iterator code common.

Signed-off-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andres Freund <andres@anarazel.de>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Falcon <thomas.falcon@intel.com>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Yang Li <yang.lee@linux.alibaba.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
parent 47172912
Loading
Loading
Loading
Loading
+44 −64
Original line number Diff line number Diff line
@@ -369,19 +369,11 @@ static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
static int read_counters_with_affinity(void)
{
	struct evlist_cpu_iterator evlist_cpu_itr;
	struct affinity saved_affinity, *affinity;

	if (all_counters_use_bpf)
		return 0;

	if (!target__has_cpu(&target) || target__has_per_thread(&target))
		affinity = NULL;
	else if (affinity__setup(&saved_affinity) < 0)
		return -1;
	else
		affinity = &saved_affinity;

	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
	evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
		struct evsel *counter = evlist_cpu_itr.evsel;

		if (evsel__is_bpf(counter))
@@ -393,8 +385,6 @@ static int read_counters_with_affinity(void)
		if (!counter->err)
			counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
	}
	if (affinity)
		affinity__cleanup(&saved_affinity);

	return 0;
}
@@ -793,7 +783,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
	const bool forks = (argc > 0);
	bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false;
	struct evlist_cpu_iterator evlist_cpu_itr;
	struct affinity saved_affinity, *affinity = NULL;
	int err, open_err = 0;
	bool second_pass = false, has_supported_counters;

@@ -805,14 +794,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
		child_pid = evsel_list->workload.pid;
	}

	if (!cpu_map__is_dummy(evsel_list->core.user_requested_cpus)) {
		if (affinity__setup(&saved_affinity) < 0) {
			err = -1;
			goto err_out;
		}
		affinity = &saved_affinity;
	}

	evlist__for_each_entry(evsel_list, counter) {
		counter->reset_group = false;
		if (bpf_counter__load(counter, &target)) {
@@ -825,15 +806,13 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)

	evlist__reset_aggr_stats(evsel_list);

	evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
		counter = evlist_cpu_itr.evsel;

	/*
	 * bperf calls evsel__open_per_cpu() in bperf__load(), so
	 * no need to call it again here.
	 */
		if (target.use_bpf)
			break;
	if (!target.use_bpf) {
		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
			counter = evlist_cpu_itr.evsel;

			if (counter->reset_group || !counter->supported)
				continue;
@@ -847,11 +826,12 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)

				open_err = errno;
				/*
			 * Weak group failed. We cannot just undo this here
			 * because earlier CPUs might be in group mode, and the kernel
			 * doesn't support mixing group and non group reads. Defer
			 * it to later.
			 * Don't close here because we're in the wrong affinity.
				 * Weak group failed. We cannot just undo this
				 * here because earlier CPUs might be in group
				 * mode, and the kernel doesn't support mixing
				 * group and non group reads. Defer it to later.
				 * Don't close here because we're in the wrong
				 * affinity.
				 */
				if ((open_err == EINVAL || open_err == EBADF) &&
					evsel__leader(counter) != counter &&
@@ -867,7 +847,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
					break;
			}
		}

	}
	if (second_pass) {
		/*
		 * Now redo all the weak group after closing them,
@@ -875,7 +855,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
		 */

		/* First close errored or weak retry */
		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
			counter = evlist_cpu_itr.evsel;

			if (!counter->reset_group && counter->supported)
@@ -884,7 +864,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
			perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx);
		}
		/* Now reopen weak */
		evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
		evlist__for_each_cpu(evlist_cpu_itr, evsel_list) {
			counter = evlist_cpu_itr.evsel;

			if (!counter->reset_group)
@@ -893,17 +873,18 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
			while (true) {
				pr_debug2("reopening weak %s\n", evsel__name(counter));
				if (create_perf_stat_counter(counter, &stat_config,
							     evlist_cpu_itr.cpu_map_idx) == 0)
							     evlist_cpu_itr.cpu_map_idx) == 0) {
					evlist_cpu_iterator__exit(&evlist_cpu_itr);
					break;

				}
				open_err = errno;
				if (stat_handle_error(counter, open_err) != COUNTER_RETRY)
				if (stat_handle_error(counter, open_err) != COUNTER_RETRY) {
					evlist_cpu_iterator__exit(&evlist_cpu_itr);
					break;
				}
			}
		}
	affinity__cleanup(affinity);
	affinity = NULL;
	}

	has_supported_counters = false;
	evlist__for_each_entry(evsel_list, counter) {
@@ -1065,7 +1046,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
	if (forks)
		evlist__cancel_workload(evsel_list);

	affinity__cleanup(affinity);
	return err;
}

+98 −60
Original line number Diff line number Diff line
@@ -359,36 +359,111 @@ int evlist__add_newtp(struct evlist *evlist, const char *sys, const char *name,
}
#endif

struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity)
/*
 * Should sched_setaffinity be used with evlist__for_each_cpu? Determine if
 * migrating the thread will avoid possibly numerous IPIs.
 */
static bool evlist__use_affinity(struct evlist *evlist)
{
	struct evlist_cpu_iterator itr = {
	struct evsel *pos;
	struct perf_cpu_map *used_cpus = NULL;
	bool ret = false;

	/*
	 * With perf record core.user_requested_cpus is usually NULL.
	 * Use the old method to handle this for now.
	 */
	if (!evlist->core.user_requested_cpus ||
	    cpu_map__is_dummy(evlist->core.user_requested_cpus))
		return false;

	evlist__for_each_entry(evlist, pos) {
		struct perf_cpu_map *intersect;

		if (!perf_pmu__benefits_from_affinity(pos->pmu))
			continue;

		if (evsel__is_dummy_event(pos)) {
			/*
			 * The dummy event is opened on all CPUs so assume >1
			 * event with shared CPUs.
			 */
			ret = true;
			break;
		}
		if (evsel__is_retire_lat(pos)) {
			/*
			 * Retirement latency events are similar to tool ones in
			 * their implementation, and so don't require affinity.
			 */
			continue;
		}
		if (perf_cpu_map__is_empty(used_cpus)) {
			/* First benefitting event, we want >1 on a common CPU. */
			used_cpus = perf_cpu_map__get(pos->core.cpus);
			continue;
		}
		if ((pos->core.attr.read_format & PERF_FORMAT_GROUP) &&
		    evsel__leader(pos) != pos) {
			/* Skip members of the same sample group. */
			continue;
		}
		intersect = perf_cpu_map__intersect(used_cpus, pos->core.cpus);
		if (!perf_cpu_map__is_empty(intersect)) {
			/* >1 event with shared CPUs. */
			perf_cpu_map__put(intersect);
			ret = true;
			break;
		}
		perf_cpu_map__put(intersect);
		perf_cpu_map__merge(&used_cpus, pos->core.cpus);
	}
	perf_cpu_map__put(used_cpus);
	return ret;
}

void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist)
{
	*itr = (struct evlist_cpu_iterator){
		.container = evlist,
		.evsel = NULL,
		.cpu_map_idx = 0,
		.evlist_cpu_map_idx = 0,
		.evlist_cpu_map_nr = perf_cpu_map__nr(evlist->core.all_cpus),
		.cpu = (struct perf_cpu){ .cpu = -1},
		.affinity = affinity,
		.affinity = NULL,
	};

	if (evlist__empty(evlist)) {
		/* Ensure the empty list doesn't iterate. */
		itr.evlist_cpu_map_idx = itr.evlist_cpu_map_nr;
	} else {
		itr.evsel = evlist__first(evlist);
		if (itr.affinity) {
			itr.cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
			affinity__set(itr.affinity, itr.cpu.cpu);
			itr.cpu_map_idx = perf_cpu_map__idx(itr.evsel->core.cpus, itr.cpu);
		itr->evlist_cpu_map_idx = itr->evlist_cpu_map_nr;
		return;
	}

	if (evlist__use_affinity(evlist)) {
		if (affinity__setup(&itr->saved_affinity) == 0)
			itr->affinity = &itr->saved_affinity;
	}
	itr->evsel = evlist__first(evlist);
	itr->cpu = perf_cpu_map__cpu(evlist->core.all_cpus, 0);
	if (itr->affinity)
		affinity__set(itr->affinity, itr->cpu.cpu);
	itr->cpu_map_idx = perf_cpu_map__idx(itr->evsel->core.cpus, itr->cpu);
	/*
	 * If this CPU isn't in the evsel's cpu map then advance
	 * through the list.
	 */
			if (itr.cpu_map_idx == -1)
				evlist_cpu_iterator__next(&itr);
		}
	if (itr->cpu_map_idx == -1)
		evlist_cpu_iterator__next(itr);
}
	return itr;

/*
 * Release the affinity state held by the iterator, if any. The iterator's
 * affinity pointer is cleared afterwards, so a repeated call does nothing.
 */
void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr)
{
	struct affinity *held = itr->affinity;

	if (held) {
		affinity__cleanup(held);
		itr->affinity = NULL;
	}
}

void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
@@ -418,14 +493,11 @@ void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr)
		 */
		if (evlist_cpu_itr->cpu_map_idx == -1)
			evlist_cpu_iterator__next(evlist_cpu_itr);
	} else {
		evlist_cpu_iterator__exit(evlist_cpu_itr);
	}
}

/*
 * Report whether iteration is finished: true once evlist_cpu_map_idx has
 * reached evlist_cpu_map_nr, false while entries remain.
 */
bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
{
	if (evlist_cpu_itr->evlist_cpu_map_idx < evlist_cpu_itr->evlist_cpu_map_nr)
		return false;

	return true;
}

static int evsel__strcmp(struct evsel *pos, char *evsel_name)
{
	if (!evsel_name)
@@ -453,19 +525,11 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
{
	struct evsel *pos;
	struct evlist_cpu_iterator evlist_cpu_itr;
	struct affinity saved_affinity, *affinity = NULL;
	bool has_imm = false;

	// See explanation in evlist__close()
	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
		if (affinity__setup(&saved_affinity) < 0)
			return;
		affinity = &saved_affinity;
	}

	/* Disable 'immediate' events last */
	for (int imm = 0; imm <= 1; imm++) {
		evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
		evlist__for_each_cpu(evlist_cpu_itr, evlist) {
			pos = evlist_cpu_itr.evsel;
			if (evsel__strcmp(pos, evsel_name))
				continue;
@@ -483,7 +547,6 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name, bool excl
			break;
	}

	affinity__cleanup(affinity);
	evlist__for_each_entry(evlist, pos) {
		if (evsel__strcmp(pos, evsel_name))
			continue;
@@ -523,16 +586,8 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
{
	struct evsel *pos;
	struct evlist_cpu_iterator evlist_cpu_itr;
	struct affinity saved_affinity, *affinity = NULL;

	// See explanation in evlist__close()
	if (!cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
		if (affinity__setup(&saved_affinity) < 0)
			return;
		affinity = &saved_affinity;
	}

	evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) {
	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
		pos = evlist_cpu_itr.evsel;
		if (evsel__strcmp(pos, evsel_name))
			continue;
@@ -542,7 +597,6 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name, bool excl_
			continue;
		evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx);
	}
	affinity__cleanup(affinity);
	evlist__for_each_entry(evlist, pos) {
		if (evsel__strcmp(pos, evsel_name))
			continue;
@@ -1339,30 +1393,14 @@ void evlist__close(struct evlist *evlist)
{
	struct evsel *evsel;
	struct evlist_cpu_iterator evlist_cpu_itr;
	struct affinity affinity;

	/*
	 * With perf record core.user_requested_cpus is usually NULL.
	 * Use the old method to handle this for now.
	 */
	if (!evlist->core.user_requested_cpus ||
	    cpu_map__is_dummy(evlist->core.user_requested_cpus)) {
		evlist__for_each_entry_reverse(evlist, evsel)
			evsel__close(evsel);
		return;
	}

	if (affinity__setup(&affinity) < 0)
		return;

	evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) {
	evlist__for_each_cpu(evlist_cpu_itr, evlist) {
		if (evlist_cpu_itr.cpu_map_idx == 0 && evsel__is_retire_lat(evlist_cpu_itr.evsel))
			evsel__tpebs_close(evlist_cpu_itr.evsel);
		perf_evsel__close_cpu(&evlist_cpu_itr.evsel->core,
				      evlist_cpu_itr.cpu_map_idx);
	}

	affinity__cleanup(&affinity);
	evlist__for_each_entry_reverse(evlist, evsel) {
		perf_evsel__free_fd(&evsel->core);
		perf_evsel__free_id(&evsel->core);
+19 −7
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <internal/evlist.h>
#include <internal/evsel.h>
#include <perf/evlist.h>
#include "affinity.h"
#include "events_stats.h"
#include "evsel.h"
#include "rblist.h"
@@ -363,6 +364,8 @@ struct evlist_cpu_iterator {
	struct perf_cpu cpu;
	/** If present, used to set the affinity when switching between CPUs. */
	struct affinity *affinity;
	/** May be used to hold affinity state prior to iterating. */
	struct affinity saved_affinity;
};

/**
@@ -370,22 +373,31 @@ struct evlist_cpu_iterator {
 *                        affinity, iterate over all CPUs and then the evlist
 *                        for each evsel on that CPU. When switching between
 *                        CPUs the affinity is set to the CPU to avoid IPIs
 *                        during syscalls.
 *                        during syscalls. The affinity is set up and removed
 *                        automatically. If the loop is exited early, a call to
 *                        evlist_cpu_iterator__exit is necessary.
 * @evlist_cpu_itr: the iterator instance.
 * @evlist: evlist instance to iterate.
 * @affinity: NULL or used to set the affinity to the current CPU.
 */
#define evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity)		\
	for ((evlist_cpu_itr) = evlist__cpu_begin(evlist, affinity);	\
#define evlist__for_each_cpu(evlist_cpu_itr, evlist)			\
	for (evlist_cpu_iterator__init(&(evlist_cpu_itr), evlist);	\
	     !evlist_cpu_iterator__end(&evlist_cpu_itr);		\
	     evlist_cpu_iterator__next(&evlist_cpu_itr))

/** Returns an iterator set to the first CPU/evsel of evlist. */
struct evlist_cpu_iterator evlist__cpu_begin(struct evlist *evlist, struct affinity *affinity);
/** Setup an iterator set to the first CPU/evsel of evlist. */
void evlist_cpu_iterator__init(struct evlist_cpu_iterator *itr, struct evlist *evlist);
/**
 * Cleans up the iterator, automatically done by evlist_cpu_iterator__next when
 * the end of the list is reached. Multiple calls are safe.
 */
void evlist_cpu_iterator__exit(struct evlist_cpu_iterator *itr);
/** Move to next element in iterator, updating CPU, evsel and the affinity. */
void evlist_cpu_iterator__next(struct evlist_cpu_iterator *evlist_cpu_itr);
/** Returns true when iterator is at the end of the CPUs and evlist. */
bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr);
static inline bool evlist_cpu_iterator__end(const struct evlist_cpu_iterator *evlist_cpu_itr)
{
	/* Still iterating while the index is below the number of evlist CPUs. */
	if (evlist_cpu_itr->evlist_cpu_map_idx < evlist_cpu_itr->evlist_cpu_map_nr)
		return false;

	return true;
}

struct evsel *evlist__get_tracking_event(struct evlist *evlist);
void evlist__set_tracking_event(struct evlist *evlist, struct evsel *tracking_evsel);
+12 −0
Original line number Diff line number Diff line
@@ -2375,6 +2375,18 @@ bool perf_pmu__is_software(const struct perf_pmu *pmu)
	return false;
}

/*
 * Should the thread be migrated to the event's CPU before accessing events of
 * this PMU? Returns true for a NULL pmu or for a pmu whose type is no greater
 * than PERF_PMU_TYPE_PE_END, false otherwise.
 */
bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu)
{
	/*
	 * A missing PMU is assumed to be core. All perf event PMUs should
	 * benefit from accessing the perf event contexts on the local CPU.
	 */
	return !pmu || pmu->type <= PERF_PMU_TYPE_PE_END;
}

FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name)
{
	char path[PATH_MAX];
+1 −0
Original line number Diff line number Diff line
@@ -303,6 +303,7 @@ bool perf_pmu__name_no_suffix_match(const struct perf_pmu *pmu, const char *to_m
 *                        perf_sw_context in the kernel?
 */
bool perf_pmu__is_software(const struct perf_pmu *pmu);
bool perf_pmu__benefits_from_affinity(struct perf_pmu *pmu);

FILE *perf_pmu__open_file(const struct perf_pmu *pmu, const char *name);
FILE *perf_pmu__open_file_at(const struct perf_pmu *pmu, int dirfd, const char *name);