netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation (207b3eba) · Commits · git / linux-net

include/net/netfilter/nf_queue.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -21,6 +21,7 @@ struct nf_queue_entry {
		struct net_device *physout;
		#endif
		struct nf_hook_state state;
		bool nf_ct_is_unconfirmed;
		u16 size; /* sizeof(entry) + saved route keys */
		u16 queue_num;

net/netfilter/nfnetlink_queue.c

+74 −49

Original line number	Diff line number	Diff line
		@@ -435,6 +435,34 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
		nf_queue_entry_free(entry);
		}

		/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
		* exclusively.
		*/
		static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry entry, bool is_unconfirmed)
		{
		#if IS_ENABLED(CONFIG_NF_CONNTRACK)
		struct nf_conn ct = (void )skb_nfct(entry->skb);

		if (!ct \|\| nf_ct_is_confirmed(ct))
		return false;

		if (is_unconfirmed)
		*is_unconfirmed = true;

		/* in some cases skb_clone() can occur after initial conntrack
		* pickup, but conntrack assumes exclusive skb->_nfct ownership for
		* unconfirmed entries.
		*
		* This happens for br_netfilter and with ip multicast routing.
		* This can't be solved with serialization here because one clone
		* could have been queued for local delivery or could be transmitted
		* in parallel on another CPU.
		*/
		return refcount_read(&ct->ct_general.use) > 1;
		#endif
		return false;
		}

		static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
		{
		const struct nf_ct_hook *ct_hook;
		@@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
		break;
		}
		}

		if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
		/* If first queued segment was already reinjected then
		* there is a good chance the ct entry is now confirmed.
		*
		* Handle the rare cases:
		* - out-of-order verdict
		* - threaded userspace reinjecting in parallel
		* - first segment was dropped
		*
		* In all of those cases we can't handle this packet
		* because we can't be sure that another CPU won't modify
		* nf_conn->ext in parallel which isn't allowed.
		*/
		if (nf_ct_drop_unconfirmed(entry, NULL))
		verdict = NF_DROP;
		}

		nf_reinject(entry, verdict);
		}

		@@ -891,49 +937,6 @@ nfqnl_build_packet_message(struct net net, struct nfqnl_instance queue,
		return NULL;
		}

		static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
		{
		#if IS_ENABLED(CONFIG_NF_CONNTRACK)
		static const unsigned long flags = IPS_CONFIRMED \| IPS_DYING;
		struct nf_conn ct = (void )skb_nfct(entry->skb);
		unsigned long status;
		unsigned int use;

		if (!ct)
		return false;

		status = READ_ONCE(ct->status);
		if ((status & flags) == IPS_DYING)
		return true;

		if (status & IPS_CONFIRMED)
		return false;

		/* in some cases skb_clone() can occur after initial conntrack
		* pickup, but conntrack assumes exclusive skb->_nfct ownership for
		* unconfirmed entries.
		*
		* This happens for br_netfilter and with ip multicast routing.
		* We can't be solved with serialization here because one clone could
		* have been queued for local delivery.
		*/
		use = refcount_read(&ct->ct_general.use);
		if (likely(use == 1))
		return false;

		/* Can't decrement further? Exclusive ownership. */
		if (!refcount_dec_not_one(&ct->ct_general.use))
		return false;

		skb_set_nfct(entry->skb, 0);
		/* No nf_ct_put(): we already decremented .use and it cannot
		* drop down to 0.
		*/
		return true;
		#endif
		return false;
		}

		static int
		__nfqnl_enqueue_packet(struct net net, struct nfqnl_instance queue,
		struct nf_queue_entry *entry)
		@@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net net, struct nfqnl_instance queue,
		}
		spin_lock_bh(&queue->lock);

		if (nf_ct_drop_unconfirmed(entry))
		goto err_out_free_nskb;

		if (queue->queue_total >= queue->queue_maxlen)
		goto err_out_queue_drop;

		@@ -995,7 +995,6 @@ __nfqnl_enqueue_packet(struct net net, struct nfqnl_instance queue,
		else
		net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
		}
		err_out_free_nskb:
		kfree_skb(nskb);
		err_out_unlock:
		spin_unlock_bh(&queue->lock);
		@@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net net, struct nfqnl_instance queue,
		static int
		nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
		{
		unsigned int queued;
		struct nfqnl_instance *queue;
		struct sk_buff skb, segs, *nskb;
		bool ct_is_unconfirmed = false;
		struct nfqnl_instance *queue;
		unsigned int queued;
		int err = -ENOBUFS;
		struct net *net = entry->state.net;
		struct nfnl_queue_net *q = nfnl_queue_pernet(net);
		@@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
		break;
		}

		/* Check if someone already holds another reference to
		* unconfirmed ct. If so, we cannot queue the skb:
		* concurrent modifications of nf_conn->ext are not
		* allowed and we can't know if another CPU isn't
		* processing the same nf_conn entry in parallel.
		*/
		if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
		return -EINVAL;

		if (!skb_is_gso(skb) \|\| ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
		return __nfqnl_enqueue_packet(net, queue, entry);

		@@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
		goto out_err;
		queued = 0;
		err = 0;

		skb_list_walk_safe(segs, segs, nskb) {
		if (ct_is_unconfirmed && queued > 0) {
		/* skb_gso_segment() increments the ct refcount.
		* This is a problem for unconfirmed (not in hash)
		* entries, those can race when reinjections happen
		* in parallel.
		*
		* Annotate this for all queued entries except the
		* first one.
		*
		* As long as the first one is reinjected first it
		* will do the confirmation for us.
		*/
		entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
		}

		if (err == 0)
		err = __nfqnl_enqueue_packet_gso(net, queue,
		segs, entry);