Commit 2cf3a3c4 authored by Johannes Berg's avatar Johannes Berg
Browse files

um: time-travel: fix signal blocking race/hang

When signals are hard-blocked in order to do time-travel
socket processing, we set signals_blocked and then handle
SIGIO signals by setting the SIGIO bit in signals_pending.
When unblocking, we first set signals_blocked to 0, and
then handle all pending signals. We have to set it first,
so that we can again properly block/unblock inside the
unblock, if the time-travel handlers need to be processed.

Unfortunately, this is racy. We can get into this situation:

// signals_pending = SIGIO_MASK

unblock_signals_hard()
   signals_blocked = 0;
   if (signals_pending && signals_enabled) {
     block_signals();
     unblock_signals()
       ...
       sig_handler_common(SIGIO, NULL, NULL);
         sigio_handler()
           ...
           sigio_reg_handler()
             irq_do_timetravel_handler()
               reg->timetravel_handler() ==
               vu_req_interrupt_comm_handler()
                 vu_req_read_message()
                   vhost_user_recv_req()
                     vhost_user_recv()
                       vhost_user_recv_header()
                         // reads 12 bytes header of
                         // 20 bytes message
<-- receive SIGIO here <--
sig_handler()
   int enabled = signals_enabled; // 1
   if ((signals_blocked || !enabled) && (sig == SIGIO)) {
     if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL)
       sigio_run_timetravel_handlers()
         _sigio_handler()
           sigio_reg_handler()
             ... as above ...
               vhost_user_recv_header()
                 // reads 8 bytes that were message payload
                 // as if it were header - but aborts since
                 // it then gets -EAGAIN
...
--> end signal handler -->
                       // continue in vhost_user_recv()
                       // full_read() for 8 bytes payload busy loops
                       // entire process hangs here

Conceptually, to fix this, we need to ensure that the
signal handler cannot run while we hard-unblock signals.
The thing that makes this more complex is that we can be
doing hard-block/unblock while unblocking. Introduce a
new signals_blocked_pending variable that we can keep at
non-zero as long as pending signals are being processed,
then we only need to ensure it's decremented safely and
the signal handler will only increment it if it's already
non-zero (or signals_blocked is set, of course.)

Note also that only the outermost call to hard-unblock is
allowed to decrement signals_blocked_pending, since it
could otherwise reach zero in an inner call, and leave
the same race happening if the timetravel_handler loops,
but that's basically required of it.

Fixes: d6b399a0 ("um: time-travel/signals: fix ndelay() in interrupt")
Link: https://patch.msgid.link/20240703110144.28034-2-johannes@sipsolutions.net


Signed-off-by: default avatarJohannes Berg <johannes.berg@intel.com>
parent 45610225
Loading
Loading
Loading
Loading
+98 −20
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@

#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
@@ -65,9 +66,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)

int signals_enabled;
#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT
static int signals_blocked;
#else
#define signals_blocked 0
static int signals_blocked, signals_blocked_pending;
#endif
static unsigned int signals_pending;
static unsigned int signals_active = 0;
@@ -76,14 +75,27 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
{
	int enabled = signals_enabled;

	if ((signals_blocked || !enabled) && (sig == SIGIO)) {
#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT
	if ((signals_blocked ||
	     __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) &&
	    (sig == SIGIO)) {
		/* increment so unblock will do another round */
		__atomic_add_fetch(&signals_blocked_pending, 1,
				   __ATOMIC_SEQ_CST);
		return;
	}
#endif

	if (!enabled && (sig == SIGIO)) {
		/*
		 * In TT_MODE_EXTERNAL, need to still call time-travel
		 * handlers unless signals are also blocked for the
		 * external time message processing. This will mark
		 * signals_pending by itself (only if necessary.)
		 * handlers. This will mark signals_pending by itself
		 * (only if necessary.)
		 * Note we won't get here if signals are hard-blocked
		 * (which is handled above), in that case the hard-
		 * unblock will handle things.
		 */
		if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL)
		if (time_travel_mode == TT_MODE_EXTERNAL)
			sigio_run_timetravel_handlers();
		else
			signals_pending |= SIGIO_MASK;
@@ -380,33 +392,99 @@ int um_set_signals_trace(int enable)
#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT
void mark_sigio_pending(void)
{
	/*
	 * It would seem that this should be atomic so
	 * it isn't a read-modify-write with a signal
	 * that could happen in the middle, losing the
	 * value set by the signal.
	 *
	 * However, this function is only called when in
	 * time-travel=ext simulation mode, in which case
	 * the only signal ever pending is SIGIO, which
	 * is blocked while this can be called, and the
	 * timer signal (SIGALRM) cannot happen.
	 */
	signals_pending |= SIGIO_MASK;
}

void block_signals_hard(void)
{
	if (signals_blocked)
		return;
	signals_blocked = 1;
	signals_blocked++;
	barrier();
}

void unblock_signals_hard(void)
{
	static bool unblocking;

	if (!signals_blocked)
		panic("unblocking signals while not blocked");

	if (--signals_blocked)
		return;
	/* Must be set to 0 before we check the pending bits etc. */
	signals_blocked = 0;
	/*
	 * Must be set to 0 before we check pending so the
	 * SIGIO handler will run as normal unless we're still
	 * going to process signals_blocked_pending.
	 */
	barrier();

	if (signals_pending && signals_enabled) {
		/* this is a bit inefficient, but that's not really important */
	/*
	 * Note that block_signals_hard()/unblock_signals_hard() can be called
	 * within the unblock_signals()/sigio_run_timetravel_handlers() below.
	 * This would still be prone to race conditions since it's actually a
	 * call _within_ e.g. vu_req_read_message(), where we observed this
	 * issue, which loops. Thus, if the inner call handles the recorded
	 * pending signals, we can get out of the inner call with the real
	 * signal hander no longer blocked, and still have a race. Thus don't
	 * handle unblocking in the inner call, if it happens, but only in
	 * the outermost call - 'unblocking' serves as an ownership for the
	 * signals_blocked_pending decrement.
	 */
	if (unblocking)
		return;
	unblocking = true;

	while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) {
		if (signals_enabled) {
			/* signals are enabled so we can touch this */
			signals_pending |= SIGIO_MASK;
			/*
			 * this is a bit inefficient, but that's
			 * not really important
			 */
			block_signals();
			unblock_signals();
	} else if (signals_pending & SIGIO_MASK) {
		/* we need to run time-travel handlers even if not enabled */
		} else {
			/*
			 * we need to run time-travel handlers even
			 * if not enabled
			 */
			sigio_run_timetravel_handlers();
		}

		/*
		 * The decrement of signals_blocked_pending must be atomic so
		 * that the signal handler will either happen before or after
		 * the decrement, not during a read-modify-write:
		 *  - If it happens before, it can increment it and we'll
		 *    decrement it and do another round in the loop.
		 *  - If it happens after it'll see 0 for both signals_blocked
		 *    and signals_blocked_pending and thus run the handler as
		 *    usual (subject to signals_enabled, but that's unrelated.)
		 *
		 * Note that a call to unblock_signals_hard() within the calls
		 * to unblock_signals() or sigio_run_timetravel_handlers() above
		 * will do nothing due to the 'unblocking' state, so this cannot
		 * underflow as the only one decrementing will be the outermost
		 * one.
		 */
		if (__atomic_sub_fetch(&signals_blocked_pending, 1,
				       __ATOMIC_SEQ_CST) < 0)
			panic("signals_blocked_pending underflow");
	}

	unblocking = false;
}
#endif