Commit 0d21e668 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull aio poll fixes from Eric Biggers:
 "Fix three bugs in aio poll, and one issue with POLLFREE more broadly:

   - aio poll didn't handle POLLFREE, causing a use-after-free.

   - aio poll could block while the file is ready.

   - aio poll called eventfd_signal() when it isn't allowed.

   - POLLFREE didn't handle multiple exclusive waiters correctly.

  This has been tested with the libaio test suite, as well as with test
  programs I wrote that reproduce the first two bugs. I am sending this
  pull request myself as no one seems to be maintaining this code"

* tag 'aio-poll-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux:
  aio: Fix incorrect usage of eventfd_signal_allowed()
  aio: fix use-after-free due to missing POLLFREE handling
  aio: keep poll requests on waitqueue until completed
  signalfd: use wake_up_pollfree()
  binder: use wake_up_pollfree()
  wait: add wake_up_pollfree()
parents b9172f9e 4b374986
Loading
Loading
Loading
Loading
+9 −12
Original line number Diff line number Diff line
@@ -4422,23 +4422,20 @@ static int binder_thread_release(struct binder_proc *proc,
	__release(&t->lock);

	/*
	 * If this thread used poll, make sure we remove the waitqueue
	 * from any epoll data structures holding it with POLLFREE.
	 * waitqueue_active() is safe to use here because we're holding
	 * the inner lock.
	 * If this thread used poll, make sure we remove the waitqueue from any
	 * poll data structures holding it.
	 */
	if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
	    waitqueue_active(&thread->wait)) {
		wake_up_poll(&thread->wait, EPOLLHUP | POLLFREE);
	}
	if (thread->looper & BINDER_LOOPER_STATE_POLL)
		wake_up_pollfree(&thread->wait);

	binder_inner_proc_unlock(thread->proc);

	/*
	 * This is needed to avoid races between wake_up_poll() above and
	 * and ep_remove_waitqueue() called for other reasons (eg the epoll file
	 * descriptor being closed); ep_remove_waitqueue() holds an RCU read
	 * lock, so we can be sure it's done after calling synchronize_rcu().
	 * This is needed to avoid races between wake_up_pollfree() above and
	 * someone else removing the last entry from the queue for other reasons
	 * (e.g. ep_remove_wait_queue() being called due to an epoll file
	 * descriptor being closed).  Such other users hold an RCU read lock, so
	 * we can be sure they're done after we call synchronize_rcu().
	 */
	if (thread->looper & BINDER_LOOPER_STATE_POLL)
		synchronize_rcu();
+152 −34
Original line number Diff line number Diff line
@@ -181,8 +181,9 @@ struct poll_iocb {
	struct file		*file;
	struct wait_queue_head	*head;
	__poll_t		events;
	bool			done;
	bool			cancelled;
	bool			work_scheduled;
	bool			work_need_resched;
	struct wait_queue_entry	wait;
	struct work_struct	work;
};
@@ -1619,6 +1620,51 @@ static void aio_poll_put_work(struct work_struct *work)
	iocb_put(iocb);
}

/*
 * Safely lock the waitqueue which the request is on, synchronizing with the
 * case where the ->poll() provider decides to free its waitqueue early.
 *
 * Returns true on success, meaning that req->head->lock was locked, req->wait
 * is on req->head, and an RCU read lock was taken.  Returns false if the
 * request was already removed from its waitqueue (which might no longer exist).
 */
static bool poll_iocb_lock_wq(struct poll_iocb *req)
{
	wait_queue_head_t *head;

	/*
	 * While we hold the waitqueue lock and the waitqueue is nonempty,
	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
	 * lock in the first place can race with the waitqueue being freed.
	 *
	 * We solve this as eventpoll does: by taking advantage of the fact that
	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
	 * we enter rcu_read_lock() and see that the pointer to the queue is
	 * non-NULL, we can then lock it without the memory being freed out from
	 * under us, then check whether the request is still on the queue.
	 *
	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
	 * case the caller deletes the entry from the queue, leaving it empty.
	 * In that case, only RCU prevents the queue memory from being freed.
	 */
	rcu_read_lock();
	head = smp_load_acquire(&req->head);
	if (head) {
		spin_lock(&head->lock);
		if (!list_empty(&req->wait.entry))
			return true;
		spin_unlock(&head->lock);
	}
	rcu_read_unlock();
	return false;
}

static void poll_iocb_unlock_wq(struct poll_iocb *req)
{
	spin_unlock(&req->head->lock);
	rcu_read_unlock();
}

static void aio_poll_complete_work(struct work_struct *work)
{
	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1638,14 +1684,27 @@ static void aio_poll_complete_work(struct work_struct *work)
	 * avoid further branches in the fast path.
	 */
	spin_lock_irq(&ctx->ctx_lock);
	if (poll_iocb_lock_wq(req)) {
		if (!mask && !READ_ONCE(req->cancelled)) {
		add_wait_queue(req->head, &req->wait);
			/*
			 * The request isn't actually ready to be completed yet.
			 * Reschedule completion if another wakeup came in.
			 */
			if (req->work_need_resched) {
				schedule_work(&req->work);
				req->work_need_resched = false;
			} else {
				req->work_scheduled = false;
			}
			poll_iocb_unlock_wq(req);
			spin_unlock_irq(&ctx->ctx_lock);
			return;
		}
		list_del_init(&req->wait.entry);
		poll_iocb_unlock_wq(req);
	} /* else, POLLFREE has freed the waitqueue, so we must complete */
	list_del_init(&iocb->ki_list);
	iocb->ki_res.res = mangle_poll(mask);
	req->done = true;
	spin_unlock_irq(&ctx->ctx_lock);

	iocb_put(iocb);
@@ -1657,13 +1716,14 @@ static int aio_poll_cancel(struct kiocb *iocb)
	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
	struct poll_iocb *req = &aiocb->poll;

	spin_lock(&req->head->lock);
	if (poll_iocb_lock_wq(req)) {
		WRITE_ONCE(req->cancelled, true);
	if (!list_empty(&req->wait.entry)) {
		list_del_init(&req->wait.entry);
		if (!req->work_scheduled) {
			schedule_work(&aiocb->poll.work);
			req->work_scheduled = true;
		}
	spin_unlock(&req->head->lock);
		poll_iocb_unlock_wq(req);
	} /* else, the request was force-cancelled by POLLFREE already */

	return 0;
}
@@ -1680,21 +1740,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
	if (mask && !(mask & req->events))
		return 0;

	list_del_init(&req->wait.entry);

	if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
		struct kioctx *ctx = iocb->ki_ctx;

	/*
		 * Try to complete the iocb inline if we can. Use
		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
		 * call this function with IRQs disabled and because IRQs
		 * have to be disabled before ctx_lock is obtained.
	 * Complete the request inline if possible.  This requires that three
	 * conditions be met:
	 *   1. An event mask must have been passed.  If a plain wakeup was done
	 *	instead, then mask == 0 and we have to call vfs_poll() to get
	 *	the events, so inline completion isn't possible.
	 *   2. The completion work must not have already been scheduled.
	 *   3. ctx_lock must not be busy.  We have to use trylock because we
	 *	already hold the waitqueue lock, so this inverts the normal
	 *	locking order.  Use irqsave/irqrestore because not all
	 *	filesystems (e.g. fuse) call this function with IRQs disabled,
	 *	yet IRQs have to be disabled before ctx_lock is obtained.
	 */
	if (mask && !req->work_scheduled &&
	    spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
		struct kioctx *ctx = iocb->ki_ctx;

		list_del_init(&req->wait.entry);
		list_del(&iocb->ki_list);
		iocb->ki_res.res = mangle_poll(mask);
		req->done = true;
		if (iocb->ki_eventfd && eventfd_signal_allowed()) {
		if (iocb->ki_eventfd && !eventfd_signal_allowed()) {
			iocb = NULL;
			INIT_WORK(&req->work, aio_poll_put_work);
			schedule_work(&req->work);
@@ -1702,8 +1768,44 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
		if (iocb)
			iocb_put(iocb);
	} else {
		/*
		 * Schedule the completion work if needed.  If it was already
		 * scheduled, record that another wakeup came in.
		 *
		 * Don't remove the request from the waitqueue here, as it might
		 * not actually be complete yet (we won't know until vfs_poll()
		 * is called), and we must not miss any wakeups.  POLLFREE is an
		 * exception to this; see below.
		 */
		if (req->work_scheduled) {
			req->work_need_resched = true;
		} else {
			schedule_work(&req->work);
			req->work_scheduled = true;
		}

		/*
		 * If the waitqueue is being freed early but we can't complete
		 * the request inline, we have to tear down the request as best
		 * we can.  That means immediately removing the request from its
		 * waitqueue and preventing all further accesses to the
		 * waitqueue via the request.  We also need to schedule the
		 * completion work (done above).  Also mark the request as
		 * cancelled, to potentially skip an unneeded call to ->poll().
		 */
		if (mask & POLLFREE) {
			WRITE_ONCE(req->cancelled, true);
			list_del_init(&req->wait.entry);

			/*
			 * Careful: this *must* be the last step, since as soon
			 * as req->head is NULL'ed out, the request can be
			 * completed and freed, since aio_poll_complete_work()
			 * will no longer need to take the waitqueue lock.
			 */
			smp_store_release(&req->head, NULL);
		}
	}
	return 1;
}
@@ -1711,6 +1813,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct aio_poll_table {
	struct poll_table_struct	pt;
	struct aio_kiocb		*iocb;
	bool				queued;
	int				error;
};

@@ -1721,11 +1824,12 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);

	/* multiple wait queues per file are not supported */
	if (unlikely(pt->iocb->poll.head)) {
	if (unlikely(pt->queued)) {
		pt->error = -EINVAL;
		return;
	}

	pt->queued = true;
	pt->error = 0;
	pt->iocb->poll.head = head;
	add_wait_queue(head, &pt->iocb->poll.wait);
@@ -1750,12 +1854,14 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;

	req->head = NULL;
	req->done = false;
	req->cancelled = false;
	req->work_scheduled = false;
	req->work_need_resched = false;

	apt.pt._qproc = aio_poll_queue_proc;
	apt.pt._key = req->events;
	apt.iocb = aiocb;
	apt.queued = false;
	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */

	/* initialized the list so that we can do list_empty checks */
@@ -1764,23 +1870,35 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)

	mask = vfs_poll(req->file, &apt.pt) & req->events;
	spin_lock_irq(&ctx->ctx_lock);
	if (likely(req->head)) {
		spin_lock(&req->head->lock);
		if (unlikely(list_empty(&req->wait.entry))) {
			if (apt.error)
	if (likely(apt.queued)) {
		bool on_queue = poll_iocb_lock_wq(req);

		if (!on_queue || req->work_scheduled) {
			/*
			 * aio_poll_wake() already either scheduled the async
			 * completion work, or completed the request inline.
			 */
			if (apt.error) /* unsupported case: multiple queues */
				cancel = true;
			apt.error = 0;
			mask = 0;
		}
		if (mask || apt.error) {
			/* Steal to complete synchronously. */
			list_del_init(&req->wait.entry);
		} else if (cancel) {
			/* Cancel if possible (may be too late though). */
			WRITE_ONCE(req->cancelled, true);
		} else if (!req->done) { /* actually waiting for an event */
		} else if (on_queue) {
			/*
			 * Actually waiting for an event, so add the request to
			 * active_reqs so that it can be cancelled if needed.
			 */
			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
			aiocb->ki_cancel = aio_poll_cancel;
		}
		spin_unlock(&req->head->lock);
		if (on_queue)
			poll_iocb_unlock_wq(req);
	}
	if (mask) { /* no async, we'd stolen it */
		aiocb->ki_res.res = mangle_poll(mask);
+1 −11
Original line number Diff line number Diff line
@@ -35,17 +35,7 @@

void signalfd_cleanup(struct sighand_struct *sighand)
{
	wait_queue_head_t *wqh = &sighand->signalfd_wqh;
	/*
	 * The lockless check can race with remove_wait_queue() in progress,
	 * but in this case its caller should run under rcu_read_lock() and
	 * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return.
	 */
	if (likely(!waitqueue_active(wqh)))
		return;

	/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
	wake_up_poll(wqh, EPOLLHUP | POLLFREE);
	wake_up_pollfree(&sighand->signalfd_wqh);
}

struct signalfd_ctx {
+26 −0
Original line number Diff line number Diff line
@@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

#define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
#define wake_up_interruptible_sync_poll_locked(x, m)				\
	__wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * In the very rare cases where a ->poll() implementation uses a waitqueue whose
 * lifetime is tied to a task rather than to the 'struct file' being polled,
 * this function must be called before the waitqueue is freed so that
 * non-blocking polls (e.g. epoll) are notified that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
	/*
	 * For performance reasons, we don't always take the queue lock here.
	 * Therefore, we might race with someone removing the last entry from
	 * the queue, and proceed while they still hold the queue lock.
	 * However, rcu_read_lock() is required to be held in such cases, so we
	 * can safely proceed with an RCU-delayed free.
	 */
	if (waitqueue_active(wq_head))
		__wake_up_pollfree(wq_head);
}

#define ___wait_cond_timeout(condition)						\
({										\
	bool __cond = (condition);						\
+1 −1
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@
#define POLLRDHUP       0x2000
#endif

#define POLLFREE	(__force __poll_t)0x4000	/* currently only for epoll */
#define POLLFREE	(__force __poll_t)0x4000

#define POLL_BUSY_LOOP	(__force __poll_t)0x8000

Loading