Commit 54377fca authored by KaFai Wan's avatar KaFai Wan Committed by Martin KaFai Lau
Browse files

bpf: Reject TCP_NODELAY in bpf-tcp-cc



A BPF TCP congestion control program can call bpf_setsockopt() from
its callbacks. In current kernels, if it calls
bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can
re-enter the TCP transmit path before the outer tcp_transmit_skb()
has completed and advanced the send head.

This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion:

  tcp_transmit_skb()
    -> tcp_event_data_sent()
      -> tcp_ca_event(sk, CA_EVENT_TX_START)
        -> cwnd_event_tx_start()
          -> bpf_setsockopt(TCP_NODELAY)
            -> tcp_push_pending_frames()
              -> tcp_write_xmit()
                -> tcp_transmit_skb()

This leads to unbounded recursion and can overflow the kernel stack.

Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing
a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP
congestion control programs. To keep it simple, all tcp-cc ops is
rejected for TCP_NODELAY.

Fixes: 7e41df5d ("bpf: Add a few optnames to bpf_setsockopt")
Suggested-by: default avatarMartin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: default avatarKaFai Wan <kafai.wan@linux.dev>
Signed-off-by: default avatarMartin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: default avatarJiayuan Chen <jiayuan.chen@linux.dev>
Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev
parent 846c76ec
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
extern const struct bpf_func_proto bpf_find_vma_proto;
+24 −0
Original line number Diff line number Diff line
@@ -5688,6 +5688,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
	.arg5_type	= ARG_CONST_SIZE,
};

BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	/*
	 * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters
	 * CA_EVENT_TX_START in bpf_tcp_cc.
	 */
	if (level == SOL_TCP && optname == TCP_NODELAY)
		return -EOPNOTSUPP;

	return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = {
	.func		= bpf_sk_setsockopt_nodelay,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg5_type	= ARG_CONST_SIZE,
};

BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
+1 −1
Original line number Diff line number Diff line
@@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
		 */
		if (prog_ops_moff(prog) !=
		    offsetof(struct tcp_congestion_ops, release))
			return &bpf_sk_setsockopt_proto;
			return &bpf_sk_setsockopt_nodelay_proto;
		return NULL;
	case BPF_FUNC_getsockopt:
		/* Since get/setsockopt is usually expected to