Merge branch 'net-enetc-add-more-checks-to-enetc_set_rxfh'

Wei Fang says: ==================== net: enetc: add more checks to enetc_set_rxfh() ENETC only supports Toeplitz algorithm, and VFs do not support setting the RSS key, but enetc_set_rxfh() does not check these constraints and silently accepts unsupported configurations. This may mislead users or tools into believing that the requested RSS settings have been successfully applied. So add checks to reject unsupported hash functions and RSS key updates on VFs, and return "-EOPNOTSUPP" to user space. ==================== Link: https://patch.msgid.link/20260326075233.3628047-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net: enetc: do not allow VF to configure the RSS key
2026-04-01 22:37:41 -04:00 · 2026-03-27 20:56:49 -07:00 · 2026-03-27 20:56:46 -07:00 · 2026-03-27 20:56:45 -07:00 · 2026-03-27 20:55:52 -07:00 · 2026-03-27 20:41:11 -07:00
10 changed files with 69 additions and 17 deletions
--- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
@@ -795,9 +795,17 @@ static int enetc_set_rxfh(struct net_device *ndev,
 	struct enetc_si *si = priv->si;
 	int err = 0;

+	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
+	    rxfh->hfunc != ETH_RSS_HASH_TOP)
+		return -EOPNOTSUPP;
+
 	/* set hash key, if PF */
-	if (rxfh->key && enetc_si_is_pf(si))
+	if (rxfh->key) {
+		if (!enetc_si_is_pf(si))
+			return -EOPNOTSUPP;
+
 		enetc_set_rss_key(si, rxfh->key);
+	}

 	/* set RSS table */
 	if (rxfh->indir)
--- a/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_debugfs.c
@@ -197,7 +197,7 @@ static int fbnic_dbg_bdq_desc_seq_show(struct seq_file *s, void *v)
 		return 0;
 	}

-	for (i = 0; i <= ring->size_mask; i++) {
+	for (i = 0; i < (ring->size_mask + 1) * FBNIC_BD_FRAG_COUNT; i++) {
 		u64 bd = le64_to_cpu(ring->desc[i]);

 		seq_printf(s, "%04x %#04llx %#014llx\n", i,
--- a/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_txrx.c
@@ -927,7 +927,7 @@ static void fbnic_fill_bdq(struct fbnic_ring *bdq)
 		/* Force DMA writes to flush before writing to tail */
 		dma_wmb();

-		writel(i, bdq->doorbell);
+		writel(i * FBNIC_BD_FRAG_COUNT, bdq->doorbell);
 	}
 }

@@ -2564,7 +2564,7 @@ static void fbnic_enable_bdq(struct fbnic_ring *hpq, struct fbnic_ring *ppq)
 	hpq->tail = 0;
 	hpq->head = 0;

-	log_size = fls(hpq->size_mask);
+	log_size = fls(hpq->size_mask) + ilog2(FBNIC_BD_FRAG_COUNT);

 	/* Store descriptor ring address and size */
 	fbnic_ring_wr32(hpq, FBNIC_QUEUE_BDQ_HPQ_BAL, lower_32_bits(hpq->dma));
@@ -2576,7 +2576,7 @@ static void fbnic_enable_bdq(struct fbnic_ring *hpq, struct fbnic_ring *ppq)
 	if (!ppq->size_mask)
 		goto write_ctl;

-	log_size = fls(ppq->size_mask);
+	log_size = fls(ppq->size_mask) + ilog2(FBNIC_BD_FRAG_COUNT);

 	/* Add enabling of PPQ to BDQ control */
 	bdq_ctl |= FBNIC_QUEUE_BDQ_CTL_PPQ_ENABLE;
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -480,11 +480,16 @@ static void sfp_quirk_ubnt_uf_instant(const struct sfp_eeprom_id *id,
 {
 	/* Ubiquiti U-Fiber Instant module claims that support all transceiver
 	 * types including 10G Ethernet which is not truth. So clear all claimed
-	 * modes and set only one mode which module supports: 1000baseX_Full.
+	 * modes and set only one mode which module supports: 1000baseX_Full,
+	 * along with the Autoneg and pause bits.
 	 */
 	linkmode_zero(caps->link_modes);
 	linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT,
 			 caps->link_modes);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, caps->link_modes);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, caps->link_modes);
+	linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, caps->link_modes);
+
 	phy_interface_zero(caps->interfaces);
 	__set_bit(PHY_INTERFACE_MODE_1000BASEX, caps->interfaces);
 }
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1965,12 +1965,14 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request,
 	ns_olen = request->len - skb_network_offset(request) -
 		sizeof(struct ipv6hdr) - sizeof(*ns);
 	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
-		if (!ns->opt[i + 1]) {
+		if (!ns->opt[i + 1] || i + (ns->opt[i + 1] << 3) > ns_olen) {
 			kfree_skb(reply);
 			return NULL;
 		}
 		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
-			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+			if ((ns->opt[i + 1] << 3) >=
+			    sizeof(struct nd_opt_hdr) + ETH_ALEN)
+				daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
 			break;
 		}
 	}
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -251,12 +251,12 @@ struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg)

 static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
 		       struct sk_buff *request, struct neighbour *n,
-		       __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns)
+		       __be16 vlan_proto, u16 vlan_tci)
 {
 	struct net_device *dev = request->dev;
 	struct net_bridge_vlan_group *vg;
+	struct nd_msg *na, *ns;
 	struct sk_buff *reply;
-	struct nd_msg *na;
 	struct ipv6hdr *pip6;
 	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
 	int ns_olen;
@@ -264,7 +264,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
 	u8 *daddr;
 	u16 pvid;

-	if (!dev)
+	if (!dev || skb_linearize(request))
 		return;

 	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
@@ -281,17 +281,21 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
 	skb_set_mac_header(reply, 0);

 	daddr = eth_hdr(request)->h_source;
+	ns = (struct nd_msg *)(skb_network_header(request) +
+			       sizeof(struct ipv6hdr));

 	/* Do we need option processing ? */
 	ns_olen = request->len - (skb_network_offset(request) +
 				  sizeof(struct ipv6hdr)) - sizeof(*ns);
 	for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
-		if (!ns->opt[i + 1]) {
+		if (!ns->opt[i + 1] || i + (ns->opt[i + 1] << 3) > ns_olen) {
 			kfree_skb(reply);
 			return;
 		}
 		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
-			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
+			if ((ns->opt[i + 1] << 3) >=
+			    sizeof(struct nd_opt_hdr) + ETH_ALEN)
+				daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
 			break;
 		}
 	}
@@ -472,9 +476,9 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
 				if (vid != 0)
 					br_nd_send(br, p, skb, n,
 						   skb->vlan_proto,
-						   skb_vlan_tag_get(skb), msg);
+						   skb_vlan_tag_get(skb));
 				else
-					br_nd_send(br, p, skb, n, 0, 0, msg);
+					br_nd_send(br, p, skb, n, 0, 0);
 				replied = true;
 			}

--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -875,6 +875,9 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 	if (!skb2)
 		return 1;

+	/* Remove debris left by IPv4 stack. */
+	memset(IP6CB(skb2), 0, sizeof(*IP6CB(skb2)));
+
 	skb_dst_drop(skb2);
 	skb_pull(skb2, nhs);
 	skb_reset_network_header(skb2);
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -601,11 +601,16 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (!skb2)
 		return 0;

+	/* Remove debris left by IPv6 stack. */
+	memset(IPCB(skb2), 0, sizeof(*IPCB(skb2)));
+
 	skb_dst_drop(skb2);

 	skb_pull(skb2, offset);
 	skb_reset_network_header(skb2);
 	eiph = ip_hdr(skb2);
+	if (eiph->version != 4 || eiph->ihl < 5)
+		goto out;

 	/* Try to guess incoming interface */
 	rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr,
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -555,7 +555,7 @@ static void
 rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
 {
 	u64 y1, y2, dx, dy;
-	u32 dsm;
+	u64 dsm;

 	if (isc->sm1 <= isc->sm2) {
 		/* service curve is convex */
@@ -598,7 +598,7 @@ rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y)
 	 */
 	dx = (y1 - y) << SM_SHIFT;
 	dsm = isc->sm1 - isc->sm2;
-	do_div(dx, dsm);
+	dx = div64_u64(dx, dsm);
 	/*
 	 * check if (x, y1) belongs to the 1st segment of rtsc.
 	 * if so, add the offset.
--- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json
@@ -1111,5 +1111,30 @@
        "teardown": [
            "$TC qdisc del dev $DUMMY root handle 1:"
        ]
+    },
+    {
+        "id": "a3d7",
+        "name": "HFSC with large m1 - no divide-by-zero on class reactivation",
+        "category": [
+            "qdisc",
+            "hfsc"
+        ],
+        "plugins": {
+            "requires": "nsPlugin"
+        },
+        "setup": [
+            "$TC qdisc replace dev $DUMMY root handle 1: hfsc default 1",
+            "$TC class replace dev $DUMMY parent 1: classid 1:1 hfsc rt m1 32gbit d 1ms m2 0bit ls m1 32gbit d 1ms m2 0bit",
+            "ping -I$DUMMY -f -c1 -s64 -W1 10.10.10.1 || true",
+            "sleep 1"
+        ],
+        "cmdUnderTest": "ping -I$DUMMY -f -c1 -s64 -W1 10.10.10.1 || true",
+        "expExitCode": "0",
+        "verifyCmd": "$TC qdisc show dev $DUMMY",
+        "matchPattern": "qdisc hfsc 1: root",
+        "matchCount": "1",
+        "teardown": [
+            "$TC qdisc del dev $DUMMY handle 1: root"
+        ]
    }
 ]
Author	SHA1	Message	Date
Jakub Kicinski	dc9e9d61e3	Merge branch 'net-enetc-add-more-checks-to-enetc_set_rxfh' Wei Fang says: ==================== net: enetc: add more checks to enetc_set_rxfh() ENETC only supports Toeplitz algorithm, and VFs do not support setting the RSS key, but enetc_set_rxfh() does not check these constraints and silently accepts unsupported configurations. This may mislead users or tools into believing that the requested RSS settings have been successfully applied. So add checks to reject unsupported hash functions and RSS key updates on VFs, and return "-EOPNOTSUPP" to user space. ==================== Link: https://patch.msgid.link/20260326075233.3628047-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:56:49 -07:00
Wei Fang	a142d13916	net: enetc: do not allow VF to configure the RSS key VFs do not have privilege to configure the RSS key because the registers are owned by the PF. Currently, if VF attempts to configure the RSS key, enetc_set_rxfh() simply skips the configuration and does not generate a warning, which may mislead users into thinking the feature is supported. To improve this situation, add a check to reject RSS key configuration on VFs. Fixes: `d382563f54` ("enetc: Add RFS and RSS support") Signed-off-by: Wei Fang <wei.fang@nxp.com> Reviewed-by: Clark Wang <xiaoning.wang@nxp.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com> Link: https://patch.msgid.link/20260326075233.3628047-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:56:46 -07:00
Wei Fang	d389954a6c	net: enetc: check whether the RSS algorithm is Toeplitz Both ENETC v1 and v4 only provide Toeplitz RSS support. This patch adds a validation check to reject attempts to configure other RSS algorithms, avoiding misleading configuration options for users. Fixes: `d382563f54` ("enetc: Add RFS and RSS support") Signed-off-by: Wei Fang <wei.fang@nxp.com> Reviewed-by: Clark Wang <xiaoning.wang@nxp.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com> Link: https://patch.msgid.link/20260326075233.3628047-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:56:45 -07:00
Marek Behún	eeee5a710f	net: sfp: Fix Ubiquiti U-Fiber Instant SFP module on mvneta In commit `8110633db4` ("net: sfp-bus: allow SFP quirks to override Autoneg and pause bits") we moved the setting of Autoneg and pause bits before the call to SFP quirk when parsing SFP module support. Since the quirk for Ubiquiti U-Fiber Instant SFP module zeroes the support bits and sets 1000baseX_Full only, the above mentioned commit changed the overall computed support from 1000baseX_Full, Autoneg, Pause, Asym_Pause to just 1000baseX_Full. This broke the SFP module for mvneta, which requires Autoneg for 1000baseX since commit `c762b7fac1` ("net: mvneta: deny disabling autoneg for 802.3z modes"). Fix this by setting back the Autoneg, Pause and Asym_Pause bits in the quirk. Fixes: `8110633db4` ("net: sfp-bus: allow SFP quirks to override Autoneg and pause bits") Signed-off-by: Marek Behún <kabel@kernel.org> Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk> Link: https://patch.msgid.link/20260326122038.2489589-1-kabel@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:55:52 -07:00
Xiang Mei	5d17af9eb2	selftests/tc-testing: add test for HFSC divide-by-zero in rtsc_min() Add a regression test for the divide-by-zero in rtsc_min() triggered when m2sm() converts a large m1 value (e.g. 32gbit) to a u64 scaled slope reaching 2^32. rtsc_min() stores the difference of two such u64 values (sm1 - sm2) in a u32 variable `dsm`, truncating 2^32 to zero and causing a divide-by-zero oops in the concave-curve intersection path. The test configures an HFSC class with m1=32gbit d=1ms m2=0bit, sends a packet to activate the class, waits for it to drain and go idle, then sends another packet to trigger reactivation through rtsc_min(). Signed-off-by: Xiang Mei <xmei5@asu.edu> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Reviewed-by: Victor Nogueira <victor@mojatatu.com> Link: https://patch.msgid.link/20260326204310.1549327-2-xmei5@asu.edu Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:41:11 -07:00
Xiang Mei	4576100b8c	net/sched: sch_hfsc: fix divide-by-zero in rtsc_min() m2sm() converts a u32 slope to a u64 scaled value. For large inputs (e.g. m1=4000000000), the result can reach 2^32. rtsc_min() stores the difference of two such u64 values in a u32 variable `dsm` and uses it as a divisor. When the difference is exactly 2^32 the truncation yields zero, causing a divide-by-zero oops in the concave-curve intersection path: Oops: divide error: 0000 RIP: 0010:rtsc_min (net/sched/sch_hfsc.c:601) Call Trace: init_ed (net/sched/sch_hfsc.c:629) hfsc_enqueue (net/sched/sch_hfsc.c:1569) [...] Widen `dsm` to u64 and replace do_div() with div64_u64() so the full difference is preserved. Fixes: `1da177e4c3` ("Linux-2.6.12-rc2") Reported-by: Weiming Shi <bestswngs@gmail.com> Signed-off-by: Xiang Mei <xmei5@asu.edu> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Link: https://patch.msgid.link/20260326204310.1549327-1-xmei5@asu.edu Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:41:11 -07:00
Jakub Kicinski	1a6fdb3518	Merge branch 'bridge-vxlan-harden-nd-option-parsing-paths' Yang Yang says: ==================== bridge/vxlan: harden ND option parsing paths This series hardens ND option parsing in bridge and vxlan paths. Patch 1 linearizes the request skb in br_nd_send() before walking ND options. Patch 2 adds explicit ND option length validation in br_nd_send(). Patch 3 adds matching ND option length validation in vxlan_na_create(). ==================== Link: https://patch.msgid.link/20260326034441.2037420-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:37:17 -07:00
Yang Yang	afa9a05e6c	vxlan: validate ND option lengths in vxlan_na_create vxlan_na_create() walks ND options according to option-provided lengths. A malformed option can make the parser advance beyond the computed option span or use a too-short source LLADDR option payload. Validate option lengths against the remaining NS option area before advancing, and only read source LLADDR when the option is large enough for an Ethernet address. Fixes: `4b29dba9c0` ("vxlan: fix nonfunctional neigh_reduce()") Cc: stable@vger.kernel.org Reported-by: Yifan Wu <yifanwucs@gmail.com> Reported-by: Juefei Pu <tomapufckgml@gmail.com> Tested-by: Ao Zhou <n05ec@lzu.edu.cn> Co-developed-by: Yuan Tan <tanyuan98@outlook.com> Signed-off-by: Yuan Tan <tanyuan98@outlook.com> Suggested-by: Xin Liu <bird@lzu.edu.cn> Signed-off-by: Yang Yang <n05ec@lzu.edu.cn> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Link: https://patch.msgid.link/20260326034441.2037420-4-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:37:14 -07:00
Yang Yang	850837965a	bridge: br_nd_send: validate ND option lengths br_nd_send() walks ND options according to option-provided lengths. A malformed option can make the parser advance beyond the computed option span or use a too-short source LLADDR option payload. Validate option lengths against the remaining NS option area before advancing, and only read source LLADDR when the option is large enough for an Ethernet address. Fixes: `ed842faeb2` ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Cc: stable@vger.kernel.org Reported-by: Yifan Wu <yifanwucs@gmail.com> Reported-by: Juefei Pu <tomapufckgml@gmail.com> Tested-by: Ao Zhou <n05ec@lzu.edu.cn> Co-developed-by: Yuan Tan <tanyuan98@outlook.com> Signed-off-by: Yuan Tan <tanyuan98@outlook.com> Suggested-by: Xin Liu <bird@lzu.edu.cn> Signed-off-by: Yang Yang <n05ec@lzu.edu.cn> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Link: https://patch.msgid.link/20260326034441.2037420-3-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:37:14 -07:00
Yang Yang	a01aee7caf	bridge: br_nd_send: linearize skb before parsing ND options br_nd_send() parses neighbour discovery options from ns->opt[] and assumes that these options are in the linear part of request. Its callers only guarantee that the ICMPv6 header and target address are available, so the option area can still be non-linear. Parsing ns->opt[] in that case can access data past the linear buffer. Linearize request before option parsing and derive ns from the linear network header. Fixes: `ed842faeb2` ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Reported-by: Yifan Wu <yifanwucs@gmail.com> Reported-by: Juefei Pu <tomapufckgml@gmail.com> Tested-by: Ao Zhou <n05ec@lzu.edu.cn> Co-developed-by: Yuan Tan <tanyuan98@outlook.com> Signed-off-by: Yuan Tan <tanyuan98@outlook.com> Suggested-by: Xin Liu <bird@lzu.edu.cn> Signed-off-by: Yang Yang <n05ec@lzu.edu.cn> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Acked-by: Nikolay Aleksandrov <razor@blackwall.org> Link: https://patch.msgid.link/20260326034441.2037420-2-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:37:14 -07:00
Jakub Kicinski	c11c731a68	Merge branch 'fix-page-fragment-handling-when-page_size-4k' Dimitri Daskalakis says: ==================== Fix page fragment handling when PAGE_SIZE > 4K FBNIC operates on fixed size descriptors (4K). When the OS supports pages larger than 4K, we fragment the page across multiple descriptors. While performance testing, I found several issues with our page fragment handling, resulting in low throughput and potential RX stalls. ==================== Link: https://patch.msgid.link/20260324195123.3486219-1-dimitri.daskalakis1@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:30:26 -07:00
Dimitri Daskalakis	f3567dd428	eth: fbnic: Fix debugfs output for BDQ's with page frags The rings size_mask represents the number of pages, so we need to determine the number of page frags when dumping the descriptors. Fixes: `df04373b0d` ("eth fbnic: Add debugfs hooks for tx/rx rings") Signed-off-by: Dimitri Daskalakis <daskald@meta.com> Link: https://patch.msgid.link/20260324195123.3486219-3-dimitri.daskalakis1@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:30:23 -07:00
Dimitri Daskalakis	b38c55320b	eth: fbnic: Account for page fragments when updating BDQ tail FBNIC supports fixed size buffers of 4K. When PAGE_SIZE > 4K, we fragment the page across multiple descriptors (FBNIC_BD_FRAG_COUNT). When refilling the BDQ, the correct number of entries are populated, but tail was only incremented by one. So on a system with 64K pages, HW would get one descriptor refilled for every 16 we populate. Additionally, we program the ring size in the HW when enabling the BDQ. This was not accounting for page fragments, so on systems with 64K pages, the HW used 1/16th of the ring. Fixes: `0cb4c0a137` ("eth: fbnic: Implement Rx queue alloc/start/stop/free") Signed-off-by: Dimitri Daskalakis <daskald@meta.com> Link: https://patch.msgid.link/20260324195123.3486219-2-dimitri.daskalakis1@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:30:23 -07:00
Eric Dumazet	2edfa31769	ip6_tunnel: clear skb2->cb[] in ip4ip6_err() Oskar Kjos reported the following problem. ip4ip6_err() calls icmp_send() on a cloned skb whose cb[] was written by the IPv6 receive path as struct inet6_skb_parm. icmp_send() passes IPCB(skb2) to __ip_options_echo(), which interprets that cb[] region as struct inet_skb_parm (IPv4). The layouts differ: inet6_skb_parm.nhoff at offset 14 overlaps inet_skb_parm.opt.rr, producing a non-zero rr value. __ip_options_echo() then reads optlen from attacker-controlled packet data at sptr[rr+1] and copies that many bytes into dopt->__data, a fixed 40-byte stack buffer (IP_OPTIONS_DATA_FIXED_SIZE). To fix this we clear skb2->cb[], as suggested by Oskar Kjos. Also add minimal IPv4 header validation (version == 4, ihl >= 5). Fixes: `c4d3efafcc` ("[IPV6] IP6TUNNEL: Add support to IPv4 over IPv6 tunnel.") Reported-by: Oskar Kjos <oskar.kjos@hotmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Link: https://patch.msgid.link/20260326155138.2429480-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:24:12 -07:00
Eric Dumazet	86ab3e5567	ipv6: icmp: clear skb2->cb[] in ip6_err_gen_icmpv6_unreach() Sashiko AI-review observed: In ip6_err_gen_icmpv6_unreach(), the skb is an outer IPv4 ICMP error packet where its cb contains an IPv4 inet_skb_parm. When skb is cloned into skb2 and passed to icmp6_send(), it uses IP6CB(skb2). IP6CB interprets the IPv4 inet_skb_parm as an inet6_skb_parm. The cipso offset in inet_skb_parm.opt directly overlaps with dsthao in inet6_skb_parm at offset 18. If an attacker sends a forged ICMPv4 error with a CIPSO IP option, dsthao would be a non-zero offset. Inside icmp6_send(), mip6_addr_swap() is called and uses ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO). This would scan the inner, attacker-controlled IPv6 packet starting at that offset, potentially returning a fake TLV without checking if the remaining packet length can hold the full 18-byte struct ipv6_destopt_hao. Could mip6_addr_swap() then perform a 16-byte swap that extends past the end of the packet data into skb_shared_info? Should the cb array also be cleared in ip6_err_gen_icmpv6_unreach() and ip6ip6_err() to prevent this? This patch implements the first suggestion. I am not sure if ip6ip6_err() needs to be changed. A separate patch would be better anyway. Fixes: `ca15a078bd` ("sit: generate icmpv6 error when receiving icmpv4 error") Reported-by: Ido Schimmel <idosch@nvidia.com> Closes: https://sashiko.dev/#/patchset/20260326155138.2429480-1-edumazet%40google.com Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Oskar Kjos <oskar.kjos@hotmail.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Link: https://patch.msgid.link/20260326202608.2976021-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>	2026-03-27 20:22:46 -07:00