Commit 0f08f0b0 authored by Florian Fuchs, committed by Paolo Abeni
Browse files

net: ps3_gelic_net: handle skb allocation failures



Handle skb allocation failures in RX path, to avoid NULL pointer
dereference and RX stalls under memory pressure. If the refill fails
with -ENOMEM, complete napi polling and wake up later to retry via timer.
Also explicitly re-enable RX DMA after oom, so the dmac doesn't remain
stopped in this situation.

Previously, memory pressure could lead to skb allocation failures and
subsequent Oops like:

	Oops: Kernel access of bad area, sig: 11 [#2]
	Hardware name: SonyPS3 Cell Broadband Engine 0x701000 PS3
	NIP [c0003d0000065900] gelic_net_poll+0x6c/0x2d0 [ps3_gelic] (unreliable)
	LR [c0003d00000659c4] gelic_net_poll+0x130/0x2d0 [ps3_gelic]
	Call Trace:
	  gelic_net_poll+0x130/0x2d0 [ps3_gelic] (unreliable)
	  __napi_poll+0x44/0x168
	  net_rx_action+0x178/0x290

Steps to reproduce the issue:
	1. Start continuous network traffic, such as an scp of a 20GB file
	2. Inject failslab errors using the kernel fault injection:
	    echo -1 > /sys/kernel/debug/failslab/times
	    echo 30 > /sys/kernel/debug/failslab/interval
	    echo 100 > /sys/kernel/debug/failslab/probability
	3. After some time, traces start to appear, kernel Oopses
	   and the system stops

Step 2 is not always necessary, as it is usually already triggered by
the transfer of a big enough file.

Fixes: 02c18891 ("ps3: gigabit ethernet driver for PS3, take3")
Signed-off-by: Florian Fuchs <fuchsfl@gmail.com>
Link: https://patch.msgid.link/20251113181000.3914980-1-fuchsfl@gmail.com


Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parent 896f1a24
Loading
Loading
Loading
Loading
+34 −11
Original line number Diff line number Diff line
@@ -260,6 +260,7 @@ void gelic_card_down(struct gelic_card *card)
	if (atomic_dec_if_positive(&card->users) == 0) {
		pr_debug("%s: real do\n", __func__);
		napi_disable(&card->napi);
		timer_delete_sync(&card->rx_oom_timer);
		/*
		 * Disable irq. Wireless interrupts will
		 * be disabled later if any
@@ -970,7 +971,8 @@ static void gelic_net_pass_skb_up(struct gelic_descr *descr,
 * gelic_card_decode_one_descr - processes an rx descriptor
 * @card: card structure
 *
 * returns 1 if a packet has been sent to the stack, otherwise 0
 * returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc
 * failure, otherwise 0
 *
 * processes an rx descriptor by iommu-unmapping the data buffer and passing
 * the packet up to the stack
@@ -981,16 +983,18 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
	struct gelic_descr_chain *chain = &card->rx_chain;
	struct gelic_descr *descr = chain->head;
	struct net_device *netdev = NULL;
	int dmac_chain_ended;
	int dmac_chain_ended = 0;
	int prepare_rx_ret;

	status = gelic_descr_get_status(descr);

	if (status == GELIC_DESCR_DMA_CARDOWNED)
		return 0;

	if (status == GELIC_DESCR_DMA_NOT_IN_USE) {
	if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) {
		dev_dbg(ctodev(card), "dormant descr? %p\n", descr);
		return 0;
		dmac_chain_ended = 1;
		goto refill;
	}

	/* netdevice select */
@@ -1048,6 +1052,7 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
refill:

	/* is the current descriptor terminated with next_descr == NULL? */
	if (!dmac_chain_ended)
		dmac_chain_ended =
			be32_to_cpu(descr->hw_regs.dmac_cmd_status) &
			GELIC_DESCR_RX_DMA_CHAIN_END;
@@ -1062,10 +1067,11 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
	gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE);

	/*
	 * this call can fail, but for now, just leave this
	 * descriptor without skb
	 * this call can fail, propagate the error
	 */
	gelic_descr_prepare_rx(card, descr);
	prepare_rx_ret = gelic_descr_prepare_rx(card, descr);
	if (prepare_rx_ret)
		return prepare_rx_ret;

	chain->tail = descr;
	chain->head = descr->next;
@@ -1087,6 +1093,13 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
	return 1;
}

/**
 * gelic_rx_oom_timer - timer callback that reschedules NAPI polling
 * @t: the expired timer, embedded in a gelic_card as rx_oom_timer
 *
 * Recovers the card that owns the timer and schedules its NAPI context.
 */
static void gelic_rx_oom_timer(struct timer_list *t)
{
	struct gelic_card *owner;

	/* map the embedded rx_oom_timer back to its containing card */
	owner = timer_container_of(owner, t, rx_oom_timer);
	napi_schedule(&owner->napi);
}

/**
 * gelic_net_poll - NAPI poll function called by the stack to return packets
 * @napi: napi structure
@@ -1099,14 +1112,22 @@ static int gelic_net_poll(struct napi_struct *napi, int budget)
{
	struct gelic_card *card = container_of(napi, struct gelic_card, napi);
	int packets_done = 0;
	int work_result = 0;

	while (packets_done < budget) {
		if (!gelic_card_decode_one_descr(card))
		work_result = gelic_card_decode_one_descr(card);
		if (work_result != 1)
			break;

		packets_done++;
	}

	if (work_result == -ENOMEM) {
		napi_complete_done(napi, packets_done);
		mod_timer(&card->rx_oom_timer, jiffies + 1);
		return packets_done;
	}

	if (packets_done < budget) {
		napi_complete_done(napi, packets_done);
		gelic_card_rx_irq_on(card);
@@ -1576,6 +1597,8 @@ static struct gelic_card *gelic_alloc_card_net(struct net_device **netdev)
	mutex_init(&card->updown_lock);
	atomic_set(&card->users, 0);

	timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0);

	return card;
}

+1 −0
Original line number Diff line number Diff line
@@ -268,6 +268,7 @@ struct gelic_vlan_id {
struct gelic_card {
	struct napi_struct napi;
	struct net_device *netdev[GELIC_PORT_MAX];
	struct timer_list rx_oom_timer;
	/*
	 * hypervisor requires irq_status should be
	 * 8 bytes aligned, but u64 member is