Commit e96f6fd3 authored by Jakub Kicinski's avatar Jakub Kicinski
Browse files

Merge branch 'net-nexthop-increase-weight-to-u16'

Petr Machata says:

====================
net: nexthop: Increase weight to u16

In CLOS networks, as link failures occur at various points in the network,
ECMP weights of the involved nodes are adjusted to compensate. With high
fan-out of the involved nodes, and overall high number of nodes,
a (non-)ECMP weight ratio that we would like to configure does not fit into
8 bits. Instead of, say, 255:254, we might like to configure something like
1000:999. For these deployments, the 8-bit weight may not be enough.

To that end, in this patchset increase the next hop weight from u8 to u16.

Patch #1 adds a flag that indicates whether the reserved fields are zeroed.
This is a follow-up to a new fix merged in commit 6d745cd0 ("net:
nexthop: Initialize all fields in dumped nexthops"). The theory behind this
patch is that there is a strict ordering between the fields actually being
zeroed, the kernel declaring that they are, and the kernel repurposing the
fields. Thus clients can use the flag to tell if it is safe to interpret
the reserved fields in any way.

Patch #2 contains the substantial code and the commit message covers the
details of the changes.

Patches #3 to #6 add selftests.
====================

Link: https://patch.msgid.link/cover.1723036486.git.petrm@nvidia.com


Signed-off-by: default avatarJakub Kicinski <kuba@kernel.org>
parents 246ef406 4b808f44
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -105,7 +105,7 @@ struct nh_grp_entry_stats {
struct nh_grp_entry {
	struct nexthop	*nh;
	struct nh_grp_entry_stats __percpu	*stats;
	u8		weight;
	u16		weight;

	union {
		struct {
@@ -192,7 +192,7 @@ struct nh_notifier_single_info {
};

struct nh_notifier_grp_entry_info {
	u8 weight;
	u16 weight;
	struct nh_notifier_single_info nh;
};

+9 −1
Original line number Diff line number Diff line
@@ -16,10 +16,15 @@ struct nhmsg {
struct nexthop_grp {
	__u32	id;	  /* nexthop id - must exist */
	__u8	weight;   /* weight of this nexthop */
	__u8	resvd1;
	__u8	weight_high;	/* high order bits of weight */
	__u16	resvd2;
};

static inline __u16 nexthop_grp_weight(const struct nexthop_grp *entry)
{
	return ((entry->weight_high << 8) | entry->weight) + 1;
}

enum {
	NEXTHOP_GRP_TYPE_MPATH,  /* hash-threshold nexthop group
				  * default type if not specified
@@ -33,6 +38,9 @@ enum {
#define NHA_OP_FLAG_DUMP_STATS		BIT(0)
#define NHA_OP_FLAG_DUMP_HW_STATS	BIT(1)

/* Response OP_FLAGS. */
#define NHA_OP_FLAG_RESP_GRP_RESVD_0	BIT(31)	/* Dump clears resvd fields. */

enum {
	NHA_UNSPEC,
	NHA_ID,		/* u32; id for nexthop. id == 0 means auto-assign */
+32 −17
Original line number Diff line number Diff line
@@ -865,15 +865,18 @@ static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
}

static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
			    u32 op_flags)
			    u32 op_flags, u32 *resp_op_flags)
{
	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
	struct nexthop_grp *p;
	size_t len = nhg->num_nh * sizeof(*p);
	struct nlattr *nla;
	u16 group_type = 0;
	u16 weight;
	int i;

	*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;

	if (nhg->hash_threshold)
		group_type = NEXTHOP_GRP_TYPE_MPATH;
	else if (nhg->resilient)
@@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,

	p = nla_data(nla);
	for (i = 0; i < nhg->num_nh; ++i) {
		weight = nhg->nh_entries[i].weight - 1;

		*p++ = (struct nexthop_grp) {
			.id = nhg->nh_entries[i].nh->id,
			.weight = nhg->nh_entries[i].weight - 1,
			.weight = weight,
			.weight_high = weight >> 8,
		};
	}

@@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,

	if (nh->is_group) {
		struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
		u32 resp_op_flags = 0;

		if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
			goto nla_put_failure;
		if (nla_put_nh_group(skb, nh, op_flags))
		if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
		    nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
			goto nla_put_failure;
		goto out;
	}
@@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
	sz += nla_total_size(4); /* NHA_ID */

	if (nh->is_group)
		sz += nh_nlmsg_size_grp(nh);
		sz += nh_nlmsg_size_grp(nh) +
		      nla_total_size(4) +	/* NHA_OP_FLAGS */
		      0;
	else
		sz += nh_nlmsg_size_single(nh);

@@ -1280,11 +1290,14 @@ static int nh_check_attr_group(struct net *net,

	nhg = nla_data(tb[NHA_GROUP]);
	for (i = 0; i < len; ++i) {
		if (nhg[i].resvd1 || nhg[i].resvd2) {
			NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
		if (nhg[i].resvd2) {
			NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
			return -EINVAL;
		}
		if (nhg[i].weight > 254) {
		if (nexthop_grp_weight(&nhg[i]) == 0) {
			/* 0xffff got passed in, representing weight of 0x10000,
			 * which is too heavy.
			 */
			NL_SET_ERR_MSG(extack, "Invalid value for weight");
			return -EINVAL;
		}
@@ -1880,9 +1893,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
static void nh_res_group_rebalance(struct nh_group *nhg,
				   struct nh_res_table *res_table)
{
	int prev_upper_bound = 0;
	int total = 0;
	int w = 0;
	u16 prev_upper_bound = 0;
	u32 total = 0;
	u32 w = 0;
	int i;

	INIT_LIST_HEAD(&res_table->uw_nh_entries);
@@ -1892,11 +1905,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg,

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;
		u16 upper_bound;
		u64 btw;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
						total);
		btw = ((u64)res_table->num_nh_buckets) * w;
		upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
		nhge->res.wants_buckets = upper_bound - prev_upper_bound;
		prev_upper_bound = upper_bound;

@@ -1962,8 +1976,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg,

static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
	int total = 0;
	int w = 0;
	u32 total = 0;
	u32 w = 0;
	int i;

	for (i = 0; i < nhg->num_nh; ++i)
@@ -1971,7 +1985,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)

	for (i = 0; i < nhg->num_nh; ++i) {
		struct nh_grp_entry *nhge = &nhg->nh_entries[i];
		int upper_bound;
		u32 upper_bound;

		w += nhge->weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
@@ -2713,7 +2727,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
			goto out_no_nh;
		}
		nhg->nh_entries[i].nh = nhe;
		nhg->nh_entries[i].weight = entry[i].weight + 1;
		nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);

		list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
		nhg->nh_entries[i].nh_parent = nh;
	}
+54 −1
Original line number Diff line number Diff line
@@ -78,7 +78,12 @@ log_test()
	else
		ret=1
		nfail=$((nfail+1))
		if [[ $rc -eq $ksft_skip ]]; then
			printf "TEST: %-60s  [SKIP]\n" "${msg}"
		else
			printf "TEST: %-60s  [FAIL]\n" "${msg}"
		fi

		if [ "$VERBOSE" = "1" ]; then
			echo "    rc=$rc, expected $expected"
		fi
@@ -923,6 +928,29 @@ ipv6_grp_fcnal()

	ipv6_grp_refs
	log_test $? 0 "Nexthop group replace refcounts"

	#
	# 16-bit weights.
	#
	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
	run_cmd "$IP nexthop add id 66 dev veth1"

	run_cmd "$IP nexthop add id 103 group 62,1000"
	if [[ $? == 0 ]]; then
		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535"
		run_cmd "$IP nexthop replace $GRP"
		check_nexthop "id 103" "$GRP"
		rc=$?
	else
		rc=$ksft_skip
	fi

	$IP nexthop flush >/dev/null 2>&1

	log_test $rc 0 "16-bit weights"
}

ipv6_res_grp_fcnal()
@@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal()
	check_nexthop_bucket "list id 102" \
		"id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62"
	log_test $? 0 "Nexthop buckets updated after replace - nECMP"

	#
	# 16-bit weights.
	#
	run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
	run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
	run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
	run_cmd "$IP nexthop add id 66 dev veth1"

	run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32"
	if [[ $? == 0 ]]; then
		local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(:
			  )type resilient buckets 32 idle_timer 0 $(:
			  )unbalanced_timer 0"
		run_cmd "$IP nexthop replace $GRP"
		check_nexthop "id 103" "$GRP unbalanced_time 0"
		rc=$?
	else
		rc=$ksft_skip
	fi

	$IP nexthop flush >/dev/null 2>&1

	log_test $rc 0 "16-bit weights"
}

ipv6_fcnal_runtime()
+7 −0
Original line number Diff line number Diff line
@@ -509,6 +509,13 @@ xfail_on_slow()
	fi
}

omit_on_slow()
{
	if [[ $KSFT_MACHINE_SLOW != yes ]]; then
		"$@"
	fi
}

xfail_on_veth()
{
	local dev=$1; shift
Loading