Commit 207b3eba authored by Florian Westphal
Browse files

netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation



Ulrich reports a regression with nfqueue:

If an application did not set the 'F_GSO' capability flag and a gso
packet with an unconfirmed nf_conn entry is received all packets are
now dropped instead of queued, because the check happens after
skb_gso_segment().  In that case, we did have exclusive ownership
of the skb and its associated conntrack entry.  The elevated use
count is due to skb_clone happening via skb_gso_segment().

Move the check so that it's performed against the aggregated packet.

Then, annotate the individual segments except the first one so we
can do a 2nd check at reinject time.

For the normal case, where userspace does in-order reinjects, this avoids
packet drops: first reinjected segment continues traversal and confirms
entry, remaining segments observe the confirmed entry.

While at it, simplify nf_ct_drop_unconfirmed(): We only care about
unconfirmed entries with a refcnt > 1, there is no need to special-case
dying entries.

This only happens with UDP.  With TCP, the only unconfirmed packet will
be the TCP SYN, those aren't aggregated by GRO.

Next patch adds a udpgro test case to cover this scenario.

Reported-by: Ulrich Weber <ulrich.weber@gmail.com>
Fixes: 7d8dc1c7 ("netfilter: nf_queue: drop packets with cloned unconfirmed conntracks")
Signed-off-by: Florian Westphal <fw@strlen.de>
parent 35f83a75
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ struct nf_queue_entry {
	struct net_device	*physout;
#endif
	struct nf_hook_state	state;
	bool			nf_ct_is_unconfirmed;
	u16			size; /* sizeof(entry) + saved route keys */
	u16			queue_num;

+74 −49
Original line number Diff line number Diff line
@@ -435,6 +435,34 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
	nf_queue_entry_free(entry);
}

/* Tell the caller whether the skb in @entry carries an unconfirmed conntrack
 * entry that is referenced by someone besides us.
 *
 * @entry:          queue entry whose skb is inspected.
 * @is_unconfirmed: optional (may be NULL); set to true when an unconfirmed
 *                  conntrack is attached.  It is never written otherwise, so
 *                  the caller must initialise it.
 *
 * Returns true if the attached conntrack is unconfirmed and its reference
 * count is above one.  Returns false if no conntrack is attached, if it is
 * already confirmed, if we hold the only reference, or if conntrack support
 * is not enabled.
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	struct nf_conn *conn = (void *)skb_nfct(entry->skb);

	if (conn && !nf_ct_is_confirmed(conn)) {
		if (is_unconfirmed)
			*is_unconfirmed = true;

		/* conntrack assumes that skb->_nfct of an unconfirmed entry
		 * is owned exclusively, yet skb_clone() may run after the
		 * initial conntrack pickup (br_netfilter, ip multicast
		 * routing).
		 *
		 * Serialization here cannot fix that: a clone may already
		 * be queued for local delivery or be in transmission on
		 * another CPU.  More than one reference means shared.
		 */
		return refcount_read(&conn->ct_general.use) > 1;
	}
#endif
	return false;
}

static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
	const struct nf_ct_hook *ct_hook;
@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
			break;
		}
	}

	if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
		/* If first queued segment was already reinjected then
		 * there is a good chance the ct entry is now confirmed.
		 *
		 * Handle the rare cases:
		 *  - out-of-order verdict
		 *  - threaded userspace reinjecting in parallel
		 *  - first segment was dropped
		 *
		 * In all of those cases we can't handle this packet
		 * because we can't be sure that another CPU won't modify
		 * nf_conn->ext in parallel which isn't allowed.
		 */
		if (nf_ct_drop_unconfirmed(entry, NULL))
			verdict = NF_DROP;
	}

	nf_reinject(entry, verdict);
}

@@ -891,49 +937,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
	return NULL;
}

/* Decide whether the skb held by @entry has to be dropped instead of queued
 * because of the state of its attached conntrack entry.
 *
 * Returns true when the conntrack is dying without being confirmed, or when
 * it is unconfirmed and one of several references could be dropped; in the
 * latter case the conntrack is also detached from the skb.
 *
 * Returns false when no conntrack is attached, when it is confirmed, when we
 * hold the only reference, or when conntrack support is not enabled.
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
	struct nf_conn *ct = (void *)skb_nfct(entry->skb);
	unsigned long status;
	unsigned int use;

	if (!ct)
		return false;

	/* Single snapshot of the status bits; both tests below use it. */
	status = READ_ONCE(ct->status);
	/* Dying and not confirmed: drop. */
	if ((status & flags) == IPS_DYING)
		return true;

	if (status & IPS_CONFIRMED)
		return false;

	/* in some cases skb_clone() can occur after initial conntrack
	 * pickup, but conntrack assumes exclusive skb->_nfct ownership for
	 * unconfirmed entries.
	 *
	 * This happens for br_netfilter and with ip multicast routing.
	 * This can't be solved with serialization here because one clone could
	 * have been queued for local delivery.
	 */
	use = refcount_read(&ct->ct_general.use);
	if (likely(use == 1))
		return false;

	/* Can't decrement further? Exclusive ownership. */
	if (!refcount_dec_not_one(&ct->ct_general.use))
		return false;

	/* Our reference is gone: detach the conntrack from the skb. */
	skb_set_nfct(entry->skb, 0);
	/* No nf_ct_put(): we already decremented .use and it cannot
	 * drop down to 0.
	 */
	return true;
#endif
	return false;
}

static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
			struct nf_queue_entry *entry)
@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
	}
	spin_lock_bh(&queue->lock);

	if (nf_ct_drop_unconfirmed(entry))
		goto err_out_free_nskb;

	if (queue->queue_total >= queue->queue_maxlen)
		goto err_out_queue_drop;

@@ -995,7 +995,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
		else
			net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
	}
err_out_free_nskb:
	kfree_skb(nskb);
err_out_unlock:
	spin_unlock_bh(&queue->lock);
@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
	unsigned int queued;
	struct nfqnl_instance *queue;
	struct sk_buff *skb, *segs, *nskb;
	bool ct_is_unconfirmed = false;
	struct nfqnl_instance *queue;
	unsigned int queued;
	int err = -ENOBUFS;
	struct net *net = entry->state.net;
	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
		break;
	}

	/* Check if someone already holds another reference to
	 * unconfirmed ct.  If so, we cannot queue the skb:
	 * concurrent modifications of nf_conn->ext are not
	 * allowed and we can't know if another CPU isn't
	 * processing the same nf_conn entry in parallel.
	 */
	if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
		return -EINVAL;

	if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
		return __nfqnl_enqueue_packet(net, queue, entry);

@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
		goto out_err;
	queued = 0;
	err = 0;

	skb_list_walk_safe(segs, segs, nskb) {
		if (ct_is_unconfirmed && queued > 0) {
			/* skb_gso_segment() increments the ct refcount.
			 * This is a problem for unconfirmed (not in hash)
			 * entries, those can race when reinjections happen
			 * in parallel.
			 *
			 * Annotate this for all queued entries except the
			 * first one.
			 *
			 * As long as the first one is reinjected first it
			 * will do the confirmation for us.
			 */
			entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
		}

		if (err == 0)
			err = __nfqnl_enqueue_packet_gso(net, queue,
							segs, entry);