Commit 96c3490d authored by Miao Xu, committed by Martin KaFai Lau
Browse files

selftests/bpf: Add test for the use of new args in cong_control



This patch adds a selftest to show the usage of the new arguments in
cong_control. For simplicity's sake, the testing example reuses cubic's
kernel functions.

Signed-off-by: Miao Xu <miaxu@meta.com>
Link: https://lore.kernel.org/r/20240502042318.801932-4-miaxu@meta.com


Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
parent 0325cbd2
Loading
Loading
Loading
Loading
+24 −0
Original line number Diff line number Diff line
@@ -14,6 +14,7 @@
#include "tcp_ca_incompl_cong_ops.skel.h"
#include "tcp_ca_unsupp_cong_op.skel.h"
#include "tcp_ca_kfunc.skel.h"
#include "bpf_cc_cubic.skel.h"

#ifndef ENOTSUPP
#define ENOTSUPP 524
@@ -452,6 +453,27 @@ static void test_tcp_ca_kfunc(void)
	tcp_ca_kfunc__destroy(skel);
}

/* Load the bpf_cc_cubic skeleton, attach its cc_cubic struct_ops map and,
 * if the attach succeeds, run do_test() against the "bpf_cc_cubic"
 * congestion control. The skeleton is destroyed on every path after a
 * successful load; the link only when the attach succeeded.
 */
static void test_cc_cubic(void)
{
	struct bpf_cc_cubic *skel = bpf_cc_cubic__open_and_load();
	struct bpf_link *ops_link;

	if (!ASSERT_OK_PTR(skel, "bpf_cc_cubic__open_and_load"))
		return;

	ops_link = bpf_map__attach_struct_ops(skel->maps.cc_cubic);
	if (ASSERT_OK_PTR(ops_link, "bpf_map__attach_struct_ops")) {
		do_test("bpf_cc_cubic", NULL);
		bpf_link__destroy(ops_link);
	}

	bpf_cc_cubic__destroy(skel);
}

void test_bpf_tcp_ca(void)
{
	if (test__start_subtest("dctcp"))
@@ -482,4 +504,6 @@ void test_bpf_tcp_ca(void)
		test_link_replace();
	if (test__start_subtest("tcp_ca_kfunc"))
		test_tcp_ca_kfunc();
	if (test__start_subtest("cc_cubic"))
		test_cc_cubic();
}
+199 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only

/* Highlights:
 * 1. The major difference between this bpf program and tcp_cubic.c
 *    is that this bpf program relies on `cong_control` rather than
 *    `cong_avoid` in the struct tcp_congestion_ops.
 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
 *    tcp_update_pacing_rate is bypassed when `cong_control` is
 *    defined, so this logic is moved into `cong_control`.
 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
 *    The main purpose is to show use cases of the arguments in
 *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
 *    kernel functions.
 */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_tracing_net.h"

/* Declare a struct_ops BPF program: place it in the "struct_ops/<name>"
 * section and expand to BPF_PROG(name, args).
 */
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, args)

#define USEC_PER_SEC 1000000UL
/* Pacing multipliers, in percent, applied by tcp_update_pacing_rate():
 * the SS ratio while cwnd is below half of ssthresh, the CA ratio otherwise.
 */
#define TCP_PACING_SS_RATIO (200)
#define TCP_PACING_CA_RATIO (120)
/* Threshold that tcp_may_raise_cwnd() compares tp->reordering against. */
#define TCP_REORDERING (12)

/* NOTE: min() and max() evaluate their arguments more than once and do no
 * type checking; avoid arguments with side effects or mixed signedness.
 */
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
/* after(seq2, seq1) is before(seq1, seq2) with the operands swapped. */
#define after(seq2, seq1) before(seq1, seq2)

/* Kernel functions referenced as __ksym externs; the struct_ops programs
 * in this file forward to them instead of re-implementing cubic in BPF.
 */
extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;

/* View a struct sock pointer as a struct inet_connection_sock pointer.
 * This is a plain cast: no checks are performed and constness is dropped.
 */
static struct inet_connection_sock *inet_csk(const struct sock *sk)
{
	struct inet_connection_sock *icsk = (struct inet_connection_sock *)sk;

	return icsk;
}

/* View a struct sock pointer as a struct tcp_sock pointer.
 * This is a plain cast: no checks are performed and constness is dropped.
 */
static struct tcp_sock *tcp_sk(const struct sock *sk)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;

	return tp;
}

/* Return true when seq1 precedes seq2 in 32-bit wrap-around sequence
 * space, i.e. when the signed 32-bit difference seq1 - seq2 is negative.
 */
static bool before(__u32 seq1, __u32 seq2)
{
	__s32 distance = (__s32)(seq1 - seq2);

	return distance < 0;
}

/* Unsigned 64-bit division, truncating toward zero. No zero check is done
 * here; callers are responsible for passing a non-zero divisor.
 */
static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	__u64 quotient = dividend / divisor;

	return quotient;
}

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	__u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= TCP_PACING_SS_RATIO;
	else
		rate *= TCP_PACING_CA_RATIO;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (tp->srtt_us)
		rate = div64_u64(rate, (__u64)tp->srtt_us);

	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
}

/* Reduce tp->snd_cwnd while the connection is in a cwnd-reduction state,
 * using the socket's prr_* counters.
 *
 * @sk:                 socket being processed
 * @newly_acked_sacked: packets newly acked or sacked by this ACK
 * @newly_lost:         packets newly marked lost by this ACK
 * @flag:               FLAG_* bits describing this ACK
 *
 * Returns without touching snd_cwnd when newly_acked_sacked <= 0 or when
 * tp->prior_cwnd is zero. Otherwise snd_cwnd becomes packets-in-flight
 * plus the number of packets allowed out (sndcnt).
 *
 * The running delivered count is kept in a local only; tp->prr_delivered
 * is read but not written back here.
 */
static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
			       int newly_lost, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
	int delta = tp->snd_ssthresh - pkts_in_flight;
	__u32 prr_delivered;

	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
		return;

	prr_delivered = tp->prr_delivered + newly_acked_sacked;

	if (delta < 0) {
		/* In flight exceeds ssthresh: send in proportion to what has
		 * been delivered, rounding the division up.
		 */
		__u64 dividend =
			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
	} else {
		/* Compare as signed ints: prr_out may exceed prr_delivered,
		 * and an unsigned comparison would let the wrapped difference
		 * win and turn into a negative sndcnt. This matches the
		 * kernel's max_t(int, ...) in the same spot.
		 */
		int prr_credit = (int)(prr_delivered - tp->prr_out);

		sndcnt = max(prr_credit, newly_acked_sacked);
		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
			sndcnt++;
		sndcnt = min(delta, sndcnt);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = pkts_in_flight + sndcnt;
}

/* Decide whether to run the increase function of congestion control.
 *
 * When tp->reordering is above TCP_REORDERING, any forward progress
 * (FLAG_FORWARD_PROGRESS) qualifies; otherwise only newly acked data
 * (FLAG_DATA_ACKED) does.
 */
static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	int wanted = FLAG_DATA_ACKED;

	if (tcp_sk(sk)->reordering > TCP_REORDERING)
		wanted = FLAG_FORWARD_PROGRESS;

	return flag & wanted;
}

/* struct_ops .init callback: forwards the socket unchanged to the
 * cubictcp_init() kernel function.
 */
void BPF_STRUCT_OPS(bpf_cubic_init, struct sock *sk)
{
	cubictcp_init(sk);
}

/* struct_ops .cwnd_event callback: forwards the socket and event unchanged
 * to the cubictcp_cwnd_event() kernel function.
 */
void BPF_STRUCT_OPS(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	cubictcp_cwnd_event(sk, event);
}

/* struct_ops .cong_control callback.
 *
 * @sk:   socket being processed
 * @ack:  passed through to cubictcp_cong_avoid()
 * @flag: FLAG_* bits describing this ACK
 * @rs:   rate sample; acked_sacked and losses are read from it
 *
 * In TCP_CA_CWR or TCP_CA_Recovery the window is reduced via
 * tcp_cwnd_reduction(). Otherwise, if tcp_may_raise_cwnd() allows it, the
 * window is grown through cubictcp_cong_avoid(). The pacing rate is
 * refreshed on every call.
 */
void BPF_STRUCT_OPS(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
		    const struct rate_sample *rs)
{
	const int reduction_states = (1 << TCP_CA_CWR) | (1 << TCP_CA_Recovery);
	struct tcp_sock *tp = tcp_sk(sk);

	if (reduction_states & (1 << inet_csk(sk)->icsk_ca_state)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);

		/* Once snd_una has reached high_seq, snap cwnd to ssthresh,
		 * but only in CWR and only when ssthresh is below
		 * TCP_INFINITE_SSTHRESH.
		 */
		if (!before(tp->snd_una, tp->high_seq) &&
		    tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
		    inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
			tp->snd_cwnd = tp->snd_ssthresh;
			tp->snd_cwnd_stamp = tcp_jiffies32;
		}
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

	tcp_update_pacing_rate(sk);
}

/* struct_ops .ssthresh callback: returns whatever the
 * cubictcp_recalc_ssthresh() kernel function returns for this socket.
 */
__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	return cubictcp_recalc_ssthresh(sk);
}

/* struct_ops .set_state callback: forwards the socket and new state
 * unchanged to the cubictcp_state() kernel function.
 */
void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	cubictcp_state(sk, new_state);
}

/* struct_ops .pkts_acked callback: forwards the socket and ACK sample
 * unchanged to the cubictcp_acked() kernel function.
 */
void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk,
		const struct ack_sample *sample)
{
	cubictcp_acked(sk, sample);
}

/* struct_ops .undo_cwnd callback: returns whatever the
 * tcp_reno_undo_cwnd() kernel function returns for this socket (the Reno
 * helper is used here, not a cubic-specific one).
 */
__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

/* struct_ops map registering the programs above as the congestion control
 * named "bpf_cc_cubic". Note that .cong_control is set and .cong_avoid is
 * not: window growth is driven from bpf_cubic_cong_control().
 */
SEC(".struct_ops")
struct tcp_congestion_ops cc_cubic = {
	.init		= (void *)bpf_cubic_init,
	.ssthresh	= (void *)bpf_cubic_recalc_ssthresh,
	.cong_control	= (void *)bpf_cubic_cong_control,
	.set_state	= (void *)bpf_cubic_state,
	.undo_cwnd	= (void *)bpf_cubic_undo_cwnd,
	.cwnd_event	= (void *)bpf_cubic_cwnd_event,
	.pkts_acked     = (void *)bpf_cubic_acked,
	.name		= "bpf_cc_cubic",
};

/* License string placed in the object's "license" section. */
char _license[] SEC("license") = "GPL";
+10 −0
Original line number Diff line number Diff line
@@ -80,6 +80,14 @@
#define TCP_INFINITE_SSTHRESH	0x7fffffff
#define TCP_PINGPONG_THRESH	3

/* ACK-processing flag bits that BPF congestion-control programs can test
 * in the `flag` argument.
 * NOTE(review): presumably these must stay in sync with the kernel's own
 * FLAG_* values - confirm against the target kernel.
 */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data.		*/
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED 0x20 /* New SACK.				*/
#define FLAG_SND_UNA_ADVANCED \
	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
/* Composite masks built from the bits above. */
#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED)

#define fib_nh_dev		nh_common.nhc_dev
#define fib_nh_gw_family	nh_common.nhc_gw_family
#define fib_nh_gw6		nh_common.nhc_gw.ipv6
@@ -119,4 +127,6 @@
#define tw_v6_daddr		__tw_common.skc_v6_daddr
#define tw_v6_rcv_saddr		__tw_common.skc_v6_rcv_saddr

/* Value of bpf_jiffies64() truncated to 32 bits; each use calls the helper. */
#define tcp_jiffies32 ((__u32)bpf_jiffies64())

#endif