Commit cd912ab3 authored by Madadi Vineeth Reddy's avatar Madadi Vineeth Reddy Committed by Namhyung Kim
Browse files

perf sched timehist: Add pre-migration wait time option



pre-migration wait time is the time that a task unnecessarily spends
on the runqueue of a CPU but doesn't get switched-in there. In terms
of tracepoints, it is the time between sched:sched_wakeup and
sched:sched_migrate_task.

Let's say a task woke up on CPU2, then it got migrated to CPU4 and
then it's switched-in to CPU4. So, here pre-migration wait time is
time that it was waiting on runqueue of CPU2 after it is woken up.

The general pattern for pre-migration to occur is:
sched:sched_wakeup
sched:sched_migrate_task
sched:sched_switch

The sched:sched_waking event is used to capture the wakeup time,
as it aligns with the existing code and only introduces a negligible
time difference.

pre-migrations are generally not useful and it increases migrations.
This metric would be helpful in testing patches mainly related to wakeup
and load-balancer code paths as better wakeup logic would choose an
optimal CPU where task would be switched-in and thereby reducing pre-
migrations.

The sample output(s) when -P or --pre-migrations is used:
=================
           time    cpu  task name                       wait time  sch delay   run time  pre-mig time
                        [tid/pid]                          (msec)     (msec)     (msec)     (msec)
--------------- ------  ------------------------------  ---------  ---------  ---------  ---------
   38456.720806 [0001]  schbench[28634/28574]               4.917      4.768      1.004      0.000
   38456.720810 [0001]  rcu_preempt[18]                     3.919      0.003      0.004      0.000
   38456.721800 [0006]  schbench[28779/28574]              23.465     23.465      1.999      0.000
   38456.722800 [0002]  schbench[28773/28574]              60.371     60.237      3.955     60.197
   38456.722806 [0001]  schbench[28634/28574]               0.004      0.004      1.996      0.000
   38456.722811 [0001]  rcu_preempt[18]                     1.996      0.005      0.005      0.000
   38456.723800 [0000]  schbench[28833/28574]               4.000      4.000      3.999      0.000
   38456.723800 [0004]  schbench[28762/28574]              42.951     42.839      3.999     39.867
   38456.723802 [0007]  schbench[28812/28574]              43.947     43.817      3.999     40.866
   38456.723804 [0001]  schbench[28587/28574]               7.935      7.822      0.993      0.000

Signed-off-by: default avatarMadadi Vineeth Reddy <vineethr@linux.ibm.com>
Reviewed-by: default avatarTim Chen <tim.c.chen@linux.intel.com>
Link: https://lore.kernel.org/r/20241004170756.18064-1-vineethr@linux.ibm.com


Signed-off-by: default avatarNamhyung Kim <namhyung@kernel.org>
parent af3902bf
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -221,6 +221,14 @@ OPTIONS for 'perf sched timehist'
	priorities are specified with -: 120-129. A combination of both can also be
	provided: 0,120-129.

-P::
--pre-migrations::
	Show pre-migration wait time. pre-migration wait time is the time spent
	by a task waiting on a runqueue but not getting the chance to run there
	and is migrated to a different runqueue where it is finally run. This
	time between sched_wakeup and migrate_task is the pre-migration wait
	time.

OPTIONS for 'perf sched replay'
------------------------------

+59 −39
Original line number Diff line number Diff line
@@ -225,6 +225,7 @@ struct perf_sched {
	bool		show_wakeups;
	bool		show_next;
	bool		show_migrations;
	bool		pre_migrations;
	bool		show_state;
	bool		show_prio;
	u64		skipped_samples;
@@ -244,7 +245,9 @@ struct thread_runtime {
	u64 dt_iowait;      /* time between CPU access by iowait (off cpu) */
	u64 dt_preempt;     /* time between CPU access by preempt (off cpu) */
	u64 dt_delay;       /* time between wakeup and sched-in */
	u64 dt_pre_mig;     /* time between migration and wakeup */
	u64 ready_to_run;   /* time of wakeup */
	u64 migrated;	    /* time when a thread is migrated */

	struct stats run_stats;
	u64 total_run_time;
@@ -252,6 +255,7 @@ struct thread_runtime {
	u64 total_iowait_time;
	u64 total_preempt_time;
	u64 total_delay_time;
	u64 total_pre_mig_time;

	char last_state;

@@ -2073,14 +2077,15 @@ static void timehist_header(struct perf_sched *sched)
		printf(" ");
	}

	if (sched->show_prio) {
		printf(" %-*s  %-*s  %9s  %9s  %9s",
		       comm_width, "task name", MAX_PRIO_STR_LEN, "prio",
		       "wait time", "sch delay", "run time");
	} else {
		printf(" %-*s  %9s  %9s  %9s", comm_width,
		       "task name", "wait time", "sch delay", "run time");
	}
	printf(" %-*s", comm_width, "task name");

	if (sched->show_prio)
		printf("  %-*s", MAX_PRIO_STR_LEN, "prio");

	printf("  %9s  %9s  %9s", "wait time", "sch delay", "run time");

	if (sched->pre_migrations)
		printf("  %9s", "pre-mig time");

	if (sched->show_state)
		printf("  %s", "state");
@@ -2095,17 +2100,15 @@ static void timehist_header(struct perf_sched *sched)
	if (sched->show_cpu_visual)
		printf(" %*s ", ncpus, "");

	if (sched->show_prio) {
		printf(" %-*s  %-*s  %9s  %9s  %9s",
		       comm_width, "[tid/pid]", MAX_PRIO_STR_LEN, "",
		       "(msec)", "(msec)", "(msec)");
	} else {
		printf(" %-*s  %9s  %9s  %9s", comm_width,
		       "[tid/pid]", "(msec)", "(msec)", "(msec)");
	}
	printf(" %-*s", comm_width, "[tid/pid]");

	if (sched->show_state)
		printf("  %5s", "");
	if (sched->show_prio)
		printf("  %-*s", MAX_PRIO_STR_LEN, "");

	printf("  %9s  %9s  %9s", "(msec)", "(msec)", "(msec)");

	if (sched->pre_migrations)
		printf("  %9s", "(msec)");

	printf("\n");

@@ -2117,15 +2120,15 @@ static void timehist_header(struct perf_sched *sched)
	if (sched->show_cpu_visual)
		printf(" %.*s ", ncpus, graph_dotted_line);

	if (sched->show_prio) {
		printf(" %.*s  %.*s  %.9s  %.9s  %.9s",
		       comm_width, graph_dotted_line, MAX_PRIO_STR_LEN, graph_dotted_line,
		       graph_dotted_line, graph_dotted_line, graph_dotted_line);
	} else {
		printf(" %.*s  %.9s  %.9s  %.9s", comm_width,
		       graph_dotted_line, graph_dotted_line, graph_dotted_line,
		       graph_dotted_line);
	}
	printf(" %.*s", comm_width, graph_dotted_line);

	if (sched->show_prio)
		printf("  %.*s", MAX_PRIO_STR_LEN, graph_dotted_line);

	printf("  %.9s  %.9s  %.9s", graph_dotted_line, graph_dotted_line, graph_dotted_line);

	if (sched->pre_migrations)
		printf("  %.9s", graph_dotted_line);

	if (sched->show_state)
		printf("  %.5s", graph_dotted_line);
@@ -2180,6 +2183,8 @@ static void timehist_print_sample(struct perf_sched *sched,

	print_sched_time(tr->dt_delay, 6);
	print_sched_time(tr->dt_run, 6);
	if (sched->pre_migrations)
		print_sched_time(tr->dt_pre_mig, 6);

	if (sched->show_state)
		printf(" %5c ", thread__tid(thread) == 0 ? 'I' : state);
@@ -2217,18 +2222,21 @@ static void timehist_print_sample(struct perf_sched *sched,
 *    last_time = time of last sched change event for current task
 *                (i.e, time process was last scheduled out)
 * ready_to_run = time of wakeup for current task
 *     migrated = time of task migration to another CPU
 *
 * -----|------------|------------|------------|------
 *    last         ready        tprev          t
 * -----|-------------|-------------|-------------|-------------|-----
 *    last         ready         migrated       tprev           t
 *    time         to run
 *
 *      |-------- dt_wait --------|
 *                   |- dt_delay -|-- dt_run --|
 *      |---------------- dt_wait ----------------|
 *                   |--------- dt_delay ---------|-- dt_run --|
 *                   |- dt_pre_mig -|
 *
 *     dt_run = run time of current task
 *    dt_wait = time between last schedule out event for task and tprev
 *              represents time spent off the cpu
 *   dt_delay = time between wakeup and schedule-in of task
 * dt_pre_mig = time between wakeup and migration to another CPU
 */

static void timehist_update_runtime_stats(struct thread_runtime *r,
@@ -2239,6 +2247,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
	r->dt_iowait  = 0;
	r->dt_preempt = 0;
	r->dt_run     = 0;
	r->dt_pre_mig = 0;

	if (tprev) {
		r->dt_run = t - tprev;
@@ -2247,6 +2256,9 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
				pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
			else
				r->dt_delay = tprev - r->ready_to_run;

			if ((r->migrated > r->ready_to_run) && (r->migrated < tprev))
				r->dt_pre_mig = r->migrated - r->ready_to_run;
		}

		if (r->last_time > tprev)
@@ -2270,6 +2282,7 @@ static void timehist_update_runtime_stats(struct thread_runtime *r,
	r->total_sleep_time   += r->dt_sleep;
	r->total_iowait_time  += r->dt_iowait;
	r->total_preempt_time += r->dt_preempt;
	r->total_pre_mig_time += r->dt_pre_mig;
}

static bool is_idle_sample(struct perf_sample *sample,
@@ -2683,9 +2696,13 @@ static int timehist_migrate_task_event(const struct perf_tool *tool,
		return -1;

	tr->migrations++;
	tr->migrated = sample->time;

	/* show migrations if requested */
	timehist_print_migration_event(sched, evsel, sample, machine, thread);
	if (sched->show_migrations) {
		timehist_print_migration_event(sched, evsel, sample,
							machine, thread);
	}

	return 0;
}
@@ -2836,11 +2853,13 @@ static int timehist_sched_change_event(const struct perf_tool *tool,
		/* last state is used to determine where to account wait time */
		tr->last_state = state;

		/* sched out event for task so reset ready to run time */
		/* sched out event for task so reset ready to run time and migrated time */
		if (state == 'R')
			tr->ready_to_run = t;
		else
			tr->ready_to_run = 0;

		tr->migrated = 0;
	}

	evsel__save_time(evsel, sample->time, sample->cpu);
@@ -3280,7 +3299,7 @@ static int perf_sched__timehist(struct perf_sched *sched)
		goto out;
	}

	if (sched->show_migrations &&
	if ((sched->show_migrations || sched->pre_migrations) &&
		perf_session__set_tracepoints_handlers(session, migrate_handlers))
		goto out;

@@ -3823,6 +3842,7 @@ int cmd_sched(int argc, const char **argv)
	OPT_BOOLEAN(0, "show-prio", &sched.show_prio, "Show task priority"),
	OPT_STRING(0, "prio", &sched.prio_str, "prio",
		   "analyze events only for given task priority(ies)"),
	OPT_BOOLEAN('P', "pre-migrations", &sched.pre_migrations, "Show pre-migration wait time"),
	OPT_PARENT(sched_options)
	};