Commit 4090fa37 authored by Kuniyuki Iwashima's avatar Kuniyuki Iwashima Committed by Jakub Kicinski
Browse files

af_unix: Replace garbage collection algorithm.



If we find a dead SCC during iteration, we call unix_collect_skb()
to splice all skb in the SCC to the global sk_buff_head, hitlist.

After iterating all SCC, we unlock unix_gc_lock and purge the queue.

Signed-off-by: default avatarKuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: default avatarPaolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parent a15702d8
Loading
Loading
Loading
Loading
+0 −8
Original line number Diff line number Diff line
@@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)

extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;

void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
void unix_del_edges(struct scm_fp_list *fpl);
void unix_update_edges(struct unix_sock *receiver);
@@ -85,12 +82,7 @@ struct unix_sock {
	struct sock		*peer;
	struct sock		*listener;
	struct unix_vertex	*vertex;
	struct list_head	link;
	unsigned long		inflight;
	spinlock_t		lock;
	unsigned long		gc_flags;
#define UNIX_GC_CANDIDATE	0
#define UNIX_GC_MAYBE_CYCLE	1
	struct socket_wq	peer_wq;
	wait_queue_entry_t	peer_wake;
	struct scm_stat		scm_stat;
+0 −12
Original line number Diff line number Diff line
@@ -980,12 +980,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
	sk->sk_destruct		= unix_sock_destructor;
	u = unix_sk(sk);
	u->listener = NULL;
	u->inflight = 0;
	u->vertex = NULL;
	u->path.dentry = NULL;
	u->path.mnt = NULL;
	spin_lock_init(&u->lock);
	INIT_LIST_HEAD(&u->link);
	mutex_init(&u->iolock); /* single task reading lock */
	mutex_init(&u->bindlock); /* single task binding lock */
	init_waitqueue_head(&u->peer_wait);
@@ -1793,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)

static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	if (too_many_unix_fds(current))
		return -ETOOMANYREFS;

@@ -1806,9 +1802,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
	if (!UNIXCB(skb).fp)
		return -ENOMEM;

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_inflight(scm->fp->user, scm->fp->fp[i]);

	if (unix_prepare_fpl(UNIXCB(skb).fp))
		return -ENOMEM;

@@ -1817,15 +1810,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)

static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
	int i;

	scm->fp = UNIXCB(skb).fp;
	UNIXCB(skb).fp = NULL;

	unix_destroy_fpl(scm->fp);

	for (i = scm->fp->count - 1; i >= 0; i--)
		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}

static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+64 −238
Original line number Diff line number Diff line
@@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
	return true;
}

enum unix_recv_queue_lock_class {
	U_RECVQ_LOCK_NORMAL,
	U_RECVQ_LOCK_EMBRYO,
};

static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
	struct unix_vertex *vertex;

	list_for_each_entry_reverse(vertex, scc, scc_entry) {
		struct sk_buff_head *queue;
		struct unix_edge *edge;
		struct unix_sock *u;

		edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
		u = edge->predecessor;
		queue = &u->sk.sk_receive_queue;

		spin_lock(&queue->lock);

		if (u->sk.sk_state == TCP_LISTEN) {
			struct sk_buff *skb;

			skb_queue_walk(queue, skb) {
				struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;

				/* listener -> embryo order, the inversion never happens. */
				spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
				skb_queue_splice_init(embryo_queue, hitlist);
				spin_unlock(&embryo_queue->lock);
			}
		} else {
			skb_queue_splice_init(queue, hitlist);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
			if (u->oob_skb) {
				kfree_skb(u->oob_skb);
				u->oob_skb = NULL;
			}
#endif
		}

		spin_unlock(&queue->lock);
	}
}

static bool unix_scc_cyclic(struct list_head *scc)
{
	struct unix_vertex *vertex;
@@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_head *scc)
static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;

static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
			    struct sk_buff_head *hitlist)
{
	LIST_HEAD(vertex_stack);
	struct unix_edge *edge;
@@ -430,7 +477,9 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde
				scc_dead = unix_vertex_dead(vertex);
		}

		if (!unix_graph_maybe_cyclic)
		if (scc_dead)
			unix_collect_skb(&scc, hitlist);
		else if (!unix_graph_maybe_cyclic)
			unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);

		list_del(&scc);
@@ -441,7 +490,7 @@ static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_inde
		goto prev_vertex;
}

static void unix_walk_scc(void)
static void unix_walk_scc(struct sk_buff_head *hitlist)
{
	unsigned long last_index = UNIX_VERTEX_INDEX_START;

@@ -454,7 +503,7 @@ static void unix_walk_scc(void)
		struct unix_vertex *vertex;

		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
		__unix_walk_scc(vertex, &last_index);
		__unix_walk_scc(vertex, &last_index, hitlist);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
@@ -463,7 +512,7 @@ static void unix_walk_scc(void)
	unix_graph_grouped = true;
}

static void unix_walk_scc_fast(void)
static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
{
	while (!list_empty(&unix_unvisited_vertices)) {
		struct unix_vertex *vertex;
@@ -480,263 +529,40 @@ static void unix_walk_scc_fast(void)
				scc_dead = unix_vertex_dead(vertex);
		}

		if (scc_dead)
			unix_collect_skb(&scc, hitlist);

		list_del(&scc);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
}

static LIST_HEAD(gc_candidates);
static LIST_HEAD(gc_inflight_list);

/* Keep the number of times in flight count for the file
 * descriptor if it is for an AF_UNIX socket.
 */
void unix_inflight(struct user_struct *user, struct file *filp)
{
	struct unix_sock *u = unix_get_socket(filp);

	spin_lock(&unix_gc_lock);

	if (u) {
		if (!u->inflight) {
			WARN_ON_ONCE(!list_empty(&u->link));
			list_add_tail(&u->link, &gc_inflight_list);
		} else {
			WARN_ON_ONCE(list_empty(&u->link));
		}
		u->inflight++;
	}

	spin_unlock(&unix_gc_lock);
}

void unix_notinflight(struct user_struct *user, struct file *filp)
{
	struct unix_sock *u = unix_get_socket(filp);

	spin_lock(&unix_gc_lock);

	if (u) {
		WARN_ON_ONCE(!u->inflight);
		WARN_ON_ONCE(list_empty(&u->link));

		u->inflight--;
		if (!u->inflight)
			list_del_init(&u->link);
	}

	spin_unlock(&unix_gc_lock);
}

static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
			  struct sk_buff_head *hitlist)
{
	struct sk_buff *skb;
	struct sk_buff *next;

	spin_lock(&x->sk_receive_queue.lock);
	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
		/* Do we have file descriptors ? */
		if (UNIXCB(skb).fp) {
			bool hit = false;
			/* Process the descriptors of this socket */
			int nfd = UNIXCB(skb).fp->count;
			struct file **fp = UNIXCB(skb).fp->fp;

			while (nfd--) {
				/* Get the socket the fd matches if it indeed does so */
				struct unix_sock *u = unix_get_socket(*fp++);

				/* Ignore non-candidates, they could have been added
				 * to the queues after starting the garbage collection
				 */
				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
					hit = true;

					func(u);
				}
			}
			if (hit && hitlist != NULL) {
				__skb_unlink(skb, &x->sk_receive_queue);
				__skb_queue_tail(hitlist, skb);
			}
		}
	}
	spin_unlock(&x->sk_receive_queue.lock);
}

static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
			  struct sk_buff_head *hitlist)
{
	if (x->sk_state != TCP_LISTEN) {
		scan_inflight(x, func, hitlist);
	} else {
		struct sk_buff *skb;
		struct sk_buff *next;
		struct unix_sock *u;
		LIST_HEAD(embryos);

		/* For a listening socket collect the queued embryos
		 * and perform a scan on them as well.
		 */
		spin_lock(&x->sk_receive_queue.lock);
		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
			u = unix_sk(skb->sk);

			/* An embryo cannot be in-flight, so it's safe
			 * to use the list link.
			 */
			WARN_ON_ONCE(!list_empty(&u->link));
			list_add_tail(&u->link, &embryos);
		}
		spin_unlock(&x->sk_receive_queue.lock);

		while (!list_empty(&embryos)) {
			u = list_entry(embryos.next, struct unix_sock, link);
			scan_inflight(&u->sk, func, hitlist);
			list_del_init(&u->link);
		}
	}
}

static void dec_inflight(struct unix_sock *usk)
{
	usk->inflight--;
}

static void inc_inflight(struct unix_sock *usk)
{
	usk->inflight++;
}

static void inc_inflight_move_tail(struct unix_sock *u)
{
	u->inflight++;

	/* If this still might be part of a cycle, move it to the end
	 * of the list, so that it's checked even if it was already
	 * passed over
	 */
	if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
		list_move_tail(&u->link, &gc_candidates);
}

static bool gc_in_progress;

static void __unix_gc(struct work_struct *work)
{
	struct sk_buff_head hitlist;
	struct unix_sock *u, *next;
	LIST_HEAD(not_cycle_list);
	struct list_head cursor;

	spin_lock(&unix_gc_lock);

	if (!unix_graph_maybe_cyclic)
	if (!unix_graph_maybe_cyclic) {
		spin_unlock(&unix_gc_lock);
		goto skip_gc;

	if (unix_graph_grouped)
		unix_walk_scc_fast();
	else
		unix_walk_scc();

	/* First, select candidates for garbage collection.  Only
	 * in-flight sockets are considered, and from those only ones
	 * which don't have any external reference.
	 *
	 * Holding unix_gc_lock will protect these candidates from
	 * being detached, and hence from gaining an external
	 * reference.  Since there are no possible receivers, all
	 * buffers currently on the candidates' queues stay there
	 * during the garbage collection.
	 *
	 * We also know that no new candidate can be added onto the
	 * receive queues.  Other, non candidate sockets _can_ be
	 * added to queue, so we must make sure only to touch
	 * candidates.
	 */
	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
		long total_refs;

		total_refs = file_count(u->sk.sk_socket->file);

		WARN_ON_ONCE(!u->inflight);
		WARN_ON_ONCE(total_refs < u->inflight);
		if (total_refs == u->inflight) {
			list_move_tail(&u->link, &gc_candidates);
			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
		}
	}

	/* Now remove all internal in-flight reference to children of
	 * the candidates.
	 */
	list_for_each_entry(u, &gc_candidates, link)
		scan_children(&u->sk, dec_inflight, NULL);
	__skb_queue_head_init(&hitlist);

	/* Restore the references for children of all candidates,
	 * which have remaining references.  Do this recursively, so
	 * only those remain, which form cyclic references.
	 *
	 * Use a "cursor" link, to make the list traversal safe, even
	 * though elements might be moved about.
	 */
	list_add(&cursor, &gc_candidates);
	while (cursor.next != &gc_candidates) {
		u = list_entry(cursor.next, struct unix_sock, link);

		/* Move cursor to after the current position. */
		list_move(&cursor, &u->link);

		if (u->inflight) {
			list_move_tail(&u->link, &not_cycle_list);
			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
			scan_children(&u->sk, inc_inflight_move_tail, NULL);
		}
	}
	list_del(&cursor);

	/* Now gc_candidates contains only garbage.  Restore original
	 * inflight counters for these as well, and remove the skbuffs
	 * which are creating the cycle(s).
	 */
	skb_queue_head_init(&hitlist);
	list_for_each_entry(u, &gc_candidates, link) {
		scan_children(&u->sk, inc_inflight, &hitlist);

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (u->oob_skb) {
			kfree_skb(u->oob_skb);
			u->oob_skb = NULL;
		}
#endif
	}

	/* not_cycle_list contains those sockets which do not make up a
	 * cycle.  Restore these to the inflight list.
	 */
	while (!list_empty(&not_cycle_list)) {
		u = list_entry(not_cycle_list.next, struct unix_sock, link);
		__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
		list_move_tail(&u->link, &gc_inflight_list);
	}
	if (unix_graph_grouped)
		unix_walk_scc_fast(&hitlist);
	else
		unix_walk_scc(&hitlist);

	spin_unlock(&unix_gc_lock);

	/* Here we are. Hitlist is filled. Die. */
	__skb_queue_purge(&hitlist);

	spin_lock(&unix_gc_lock);

	/* All candidates should have been detached by now. */
	WARN_ON_ONCE(!list_empty(&gc_candidates));
skip_gc:
	/* Paired with READ_ONCE() in wait_for_unix_gc(). */
	WRITE_ONCE(gc_in_progress, false);

	spin_unlock(&unix_gc_lock);
}

static DECLARE_WORK(unix_gc_work, __unix_gc);