Commit 3a1a66d1 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files
Florian Westpha says:

====================
netfilter pull request nf-25-09-10

First patch adds a lockdep annotation for a false-positive splat.
Last patch adds formal reviewer tag for Phil Sutter to MAINTAINERS.

Rest of the patches resolve spurious false negative results during set
lookups while another CPU is processing a transaction.

This has been broken at least since v4.18 when an unconditional
synchronize_rcu call was removed from the commit phase of nf_tables.

Quoting from Stefan Hanreichs original report:

 It seems like we've found an issue with atomicity when reloading
 nftables rulesets. Sometimes there is a small window where rules
 containing sets do not seem to apply to incoming traffic, due to the set
 apparently being empty for a short amount of time when flushing / adding
 elements.

Exanple ruleset:
table ip filter {
  set match {
    type ipv4_addr
    flags interval
    elements = { 0.0.0.0-192.168.2.19, 192.168.2.21-255.255.255.255 }
  }

  chain pre {
    type filter hook prerouting priority filter; policy accept;
    ip saddr @match accept
    counter comment "must never match"
  }
}

Reproducer transaction:
while true:
nft -f -<<EOF
 flush set ip filter match
 create element ip filter match { \
    0.0.0.0-192.168.2.19, 192.168.2.21-255.255.255.255 }
EOF
done

Then create traffic. to/from e.g. 192.168.2.1 to 192.168.3.10.
Once in a while the counter will increment even though the
'ip saddr @match' rule should have accepted the packet.

See individual patches for details.

Thanks to Stefan Hanreich for an initial description and reproducer for
this bug and to Pablo Neira Ayuso for reviewing earlier iterations of
the patchset.

* tag 'nf-25-09-10-v2' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
  MAINTAINERS: add Phil as netfilter reviewer
  netfilter: nf_tables: restart set lookup on base_seq change
  netfilter: nf_tables: make nft_set_do_lookup available unconditionally
  netfilter: nf_tables: place base_seq in struct net
  netfilter: nft_set_rbtree: continue traversal if element is inactive
  netfilter: nft_set_pipapo: don't check genbit from packetpath lookups
  netfilter: nft_set_bitmap: fix lockdep splat due to missing annotation
====================

Link: https://patch.msgid.link/20250910190308.13356-1-fw@strlen.de


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents ccf78f7f 37a9675e
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -17480,6 +17480,7 @@ NETFILTER
M:	Pablo Neira Ayuso <pablo@netfilter.org>
M:	Jozsef Kadlecsik <kadlec@netfilter.org>
M:	Florian Westphal <fw@strlen.de>
R:	Phil Sutter <phil@nwl.cc>
L:	netfilter-devel@vger.kernel.org
L:	coreteam@netfilter.org
S:	Maintained
+0 −1
Original line number Diff line number Diff line
@@ -1912,7 +1912,6 @@ struct nftables_pernet {
	struct mutex		commit_mutex;
	u64			table_handle;
	u64			tstamp;
	unsigned int		base_seq;
	unsigned int		gc_seq;
	u8			validate_state;
	struct work_struct	destroy_work;
+2 −8
Original line number Diff line number Diff line
@@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
const struct nft_set_ext *
nft_hash_lookup(const struct net *net, const struct nft_set *set,
		const u32 *key);
#endif

const struct nft_set_ext *
nft_set_do_lookup(const struct net *net, const struct nft_set *set,
		  const u32 *key);
#else
static inline const struct nft_set_ext *
nft_set_do_lookup(const struct net *net, const struct nft_set *set,
		  const u32 *key)
{
	return set->ops->lookup(net, set, key);
}
#endif

/* called from nft_pipapo_avx2.c */
const struct nft_set_ext *
+1 −0
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
#define _NETNS_NFTABLES_H_

struct netns_nftables {
	unsigned int		base_seq;
	u8			gencursor;
};

+34 −32
Original line number Diff line number Diff line
@@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
	return ERR_PTR(-ENOENT);
}

static __be16 nft_base_seq(const struct net *net)
static unsigned int nft_base_seq(const struct net *net)
{
	struct nftables_pernet *nft_net = nft_pernet(net);
	return READ_ONCE(net->nft.base_seq);
}

	return htons(nft_net->base_seq & 0xffff);
static __be16 nft_base_seq_be16(const struct net *net)
{
	return htons(nft_base_seq(net) & 0xffff);
}

static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
@@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,

	nlh = nfnl_msg_put(skb, portid, seq,
			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
			   flags, family, NFNETLINK_V0, nft_base_seq(net));
			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,

	nlh = nfnl_msg_put(skb, portid, seq,
			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
			   flags, family, NFNETLINK_V0, nft_base_seq(net));
			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
	u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);

	nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
			   nft_base_seq(net));
			   nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb,
	buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
			nla_len(nla[NFTA_RULE_TABLE]),
			(char *)nla_data(nla[NFTA_RULE_TABLE]),
			nft_net->base_seq);
			nft_base_seq(net));
	audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
			AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
	kfree(buf);
@@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
	nlh = nfnl_msg_put(skb, portid, seq,
			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
			   flags, ctx->family, NFNETLINK_V0,
			   nft_base_seq(ctx->net));
			   nft_base_seq_be16(ctx->net));
	if (!nlh)
		goto nla_put_failure;

@@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (ctx->family != NFPROTO_UNSPEC &&
@@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
@@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
	seq    = cb->nlh->nlmsg_seq;

	nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
			   table->family, NFNETLINK_V0, nft_base_seq(net));
			   table->family, NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,

	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
	nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
			   NFNETLINK_V0, nft_base_seq(ctx->net));
			   NFNETLINK_V0, nft_base_seq_be16(ctx->net));
	if (!nlh)
		goto nla_put_failure;

@@ -6630,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb,
		}
		nelems++;
	}
	audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems);
	audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);

out_unlock:
	rcu_read_unlock();
@@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,

	nlh = nfnl_msg_put(skb, portid, seq,
			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
			   flags, family, NFNETLINK_V0, nft_base_seq(net));
			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8480,7 +8483,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
			idx++;
		}
		if (ctx->reset && entries)
			audit_log_obj_reset(table, nft_net->base_seq, entries);
			audit_log_obj_reset(table, nft_base_seq(net), entries);
		if (rc < 0)
			break;
	}
@@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb,
	buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
			nla_len(nla[NFTA_OBJ_TABLE]),
			(char *)nla_data(nla[NFTA_OBJ_TABLE]),
			nft_net->base_seq);
			nft_base_seq(net));
	audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
			AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
	kfree(buf);
@@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
		    struct nft_object *obj, u32 portid, u32 seq, int event,
		    u16 flags, int family, int report, gfp_t gfp)
{
	struct nftables_pernet *nft_net = nft_pernet(net);
	char *buf = kasprintf(gfp, "%s:%u",
			      table->name, nft_net->base_seq);
			      table->name, nft_base_seq(net));

	audit_log_nfcfg(buf,
			family,
@@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,

	nlh = nfnl_msg_put(skb, portid, seq,
			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
			   flags, family, NFNETLINK_V0, nft_base_seq(net));
			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

@@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,

	rcu_read_lock();
	nft_net = nft_pernet(net);
	cb->seq = READ_ONCE(nft_net->base_seq);
	cb->seq = nft_base_seq(net);

	list_for_each_entry_rcu(table, &nft_net->tables, list) {
		if (family != NFPROTO_UNSPEC && family != table->family)
@@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
				   u32 portid, u32 seq)
{
	struct nftables_pernet *nft_net = nft_pernet(net);
	struct nlmsghdr *nlh;
	char buf[TASK_COMM_LEN];
	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);

	nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
			   NFNETLINK_V0, nft_base_seq(net));
			   NFNETLINK_V0, nft_base_seq_be16(net));
	if (!nlh)
		goto nla_put_failure;

	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
		goto nla_put_failure;
@@ -10968,11 +10969,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
	 * Bump generation counter, invalidate any dump in progress.
	 * Cannot fail after this point.
	 */
	base_seq = READ_ONCE(nft_net->base_seq);
	base_seq = nft_base_seq(net);
	while (++base_seq == 0)
		;

	WRITE_ONCE(nft_net->base_seq, base_seq);
	/* pairs with smp_load_acquire in nft_lookup_eval */
	smp_store_release(&net->nft.base_seq, base_seq);

	gc_seq = nft_gc_seq_begin(nft_net);

@@ -11181,7 +11183,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)

	nft_commit_notify(net, NETLINK_CB(skb).portid);
	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
	nf_tables_commit_audit_log(&adl, nft_net->base_seq);
	nf_tables_commit_audit_log(&adl, nft_base_seq(net));

	nft_gc_seq_end(nft_net, gc_seq);
	nft_net->validate_state = NFT_VALIDATE_SKIP;
@@ -11506,7 +11508,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
	mutex_lock(&nft_net->commit_mutex);
	nft_net->tstamp = get_jiffies_64();

	genid_ok = genid == 0 || nft_net->base_seq == genid;
	genid_ok = genid == 0 || nft_base_seq(net) == genid;
	if (!genid_ok)
		mutex_unlock(&nft_net->commit_mutex);

@@ -12143,7 +12145,7 @@ static int __net_init nf_tables_init_net(struct net *net)
	INIT_LIST_HEAD(&nft_net->module_list);
	INIT_LIST_HEAD(&nft_net->notify_list);
	mutex_init(&nft_net->commit_mutex);
	nft_net->base_seq = 1;
	net->nft.base_seq = 1;
	nft_net->gc_seq = 0;
	nft_net->validate_state = NFT_VALIDATE_SKIP;
	INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
Loading