Commit 6be02e3e authored by Christian Hopps, committed by Steffen Klassert
Browse files

xfrm: iptfs: handle reordering of received packets



Handle the receipt of the outer tunnel packets out-of-order. Pointers to
the out-of-order packets are saved in a window (array) awaiting needed
prior packets. When the required prior packets are received the now
in-order packets are then passed on to the regular packet receive code.
A timer is used to declare missing earlier packets lost so that the
algorithm can advance.

Signed-off-by: Christian Hopps <chopps@labn.net>
Tested-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
parent 5f2b6a90
Loading
Loading
Loading
Loading
+484 −13
Original line number Diff line number Diff line
@@ -39,6 +39,17 @@
 */
#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000

/**
 * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size
 *
 * The default IPTFS reorder window size. The reorder window size dictates the
 * maximum number of IPTFS tunnel packets in a sequence that may arrive out of
 * order.
 *
 * Default 3. (tcp folks suggested)
 */
#define IPTFS_DEFAULT_REORDER_WINDOW 3

/* ------------------------------------------------ */
/* IPTFS default SA values (tunnel ingress/dir-out) */
/* ------------------------------------------------ */
@@ -95,14 +106,22 @@
 * @max_queue_size: The maximum number of octets allowed to be queued to be sent
 *	over the IPTFS SA. The queue size is measured as the size of all the
 *	packets enqueued.
 * @reorder_win_size: the number of slots in the reorder window, thus the
 *	number of packets that may arrive out of order.
 * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
 */
struct xfrm_iptfs_config {
	u32 pkt_size;	    /* outer_packet_size or 0 */
	u32 max_queue_size; /* octets */
	u16 reorder_win_size; /* slots in the reorder window; 0 disables reordering */
	u8 dont_frag : 1; /* set to inhibit fragmenting across outer packets */
};

/**
 * struct skb_wseq - one slot of the receive reorder window.
 * @skb: the saved out-of-order packet, or NULL when the slot is empty (the
 *	packet with that sequence number has not been received).
 * @drop_time: deadline, on the ktime_get_raw_fast_ns() clock, after which the
 *	reorder code stops waiting for packets at or before this slot.
 */
struct skb_wseq {
	struct sk_buff *skb;
	u64 drop_time;
};

/**
 * struct xfrm_iptfs_data - mode specific xfrm state.
 * @cfg: IPTFS tunnel config.
@@ -113,6 +132,10 @@ struct xfrm_iptfs_config {
 * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
 * @iptfs_timer: output timer.
 * @payload_mtu: max payload size.
 * @w_seq_set: true after first seq received.
 * @w_wantseq: waiting for this seq number as next to process (in order).
 * @w_saved: the saved buf array (reorder window).
 * @w_savedlen: the saved len (not size).
 * @drop_lock: lock to protect reorder queue.
 * @drop_timer: timer for considering next packet lost.
 * @drop_time_ns: timer interval in nanoseconds.
@@ -134,12 +157,16 @@ struct xfrm_iptfs_data {
	struct hrtimer iptfs_timer; /* output timer */
	u32 payload_mtu;	    /* max payload size */

	/* Tunnel egress */
	/* Tunnel input reordering */
	bool w_seq_set;		  /* true after first seq received */
	u64 w_wantseq;		  /* expected next sequence */
	struct skb_wseq *w_saved; /* the saved buf array */
	u32 w_savedlen;		  /* the saved len (not size) */
	spinlock_t drop_lock;
	struct hrtimer drop_timer;
	u64 drop_time_ns;

	/* Tunnel egress reassembly */
	/* Tunnel input reassembly */
	struct sk_buff *ra_newskb; /* new pkt being reassembled */
	u64 ra_wantseq;		   /* expected next sequence */
	u8 ra_runt[6];		   /* last pkt bytes from last skb */
@@ -1123,15 +1150,13 @@ static bool __input_process_payload(struct xfrm_state *x, u32 data,
}

/**
 * iptfs_input() - handle receipt of iptfs payload
 * iptfs_input_ordered() - handle next in order IPTFS payload.
 * @x: xfrm state
 * @skb: the packet
 * @skb: current packet
 *
 * Process the IPTFS payload in `skb` and consume it afterwards.
 *
 * Returns 0.
 */
static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
{
	struct ip_iptfs_cc_hdr iptcch;
	struct skb_seq_state skbseq;
@@ -1206,12 +1231,355 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
		skb_abort_seq_read(&skbseq);
		kfree_skb(skb);
	}
}

	/* We always have dealt with the input SKB, either we are re-using it,
	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
	 * processing it.
/* ------------------------------- */
/* Input (Egress) Re-ordering Code */
/* ------------------------------- */

/**
 * __vec_shift() - remove leading slots from the reorder window array.
 * @xtfs: IPTFS state owning the window (w_saved / w_savedlen).
 * @shift: number of leading slots to remove; clamped to the used length.
 *
 * Slides the surviving slots down to index 0, zeroes the vacated tail slots
 * and reduces w_savedlen by the (clamped) shift. Any skb pointers in the
 * removed slots are simply overwritten; this function frees nothing.
 */
static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift)
{
	u32 savedlen = xtfs->w_savedlen;

	if (shift > savedlen)
		shift = savedlen;

	/* Source and destination overlap whenever more slots survive than are
	 * removed (e.g. savedlen 3, shift 1), so this must be memmove();
	 * memcpy() has undefined behavior on overlapping regions.
	 */
	if (shift != savedlen)
		memmove(xtfs->w_saved, xtfs->w_saved + shift,
			(savedlen - shift) * sizeof(*xtfs->w_saved));
	memset(xtfs->w_saved + savedlen - shift, 0,
	       shift * sizeof(*xtfs->w_saved));
	xtfs->w_savedlen -= shift;
}

/* Handle a packet whose sequence number is older than the one we are waiting
 * for: it can no longer be delivered in order, so queue it on @freelist for
 * the caller to release. @xtfs is unused.
 */
static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
			   struct list_head *freelist)
{
	struct list_head *node = &inskb->list;

	list_add_tail(node, freelist);
}

static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)

{
	struct skb_wseq *s, *se;
	const u32 savedlen = xtfs->w_savedlen;
	time64_t now = ktime_get_raw_fast_ns();
	u32 count = 0;
	u32 scount = 0;

	if (xtfs->w_saved[0].drop_time > now)
		goto set_timer;

	++xtfs->w_wantseq;

	/* Keep flushing packets until we reach a drop time greater than now. */
	s = xtfs->w_saved;
	se = s + savedlen;
	do {
		/* Walking past empty slots until we reach a packet */
		for (; s < se && !s->skb; s++) {
			if (s->drop_time > now)
				goto outerdone;
		}
		/* Sending packets until we hit another empty slot. */
		for (; s < se && s->skb; scount++, s++)
			list_add_tail(&s->skb->list, list);
	} while (s < se);
outerdone:

	count = s - xtfs->w_saved;
	if (count) {
		xtfs->w_wantseq += count;

		/* Shift handled slots plus final empty slot into slot 0. */
		__vec_shift(xtfs, count);
	}

	if (xtfs->w_savedlen) {
set_timer:
		/* Drifting is OK */
		hrtimer_start(&xtfs->drop_timer,
			      xtfs->w_saved[0].drop_time - now,
			      IPTFS_HRTIMER_MODE);
	}
	return scount;
}

/* The packet we were waiting for has arrived: deliver it, then deliver every
 * saved packet that directly follows it in sequence, and finally slide the
 * window past everything delivered plus the next (missing) sequence number.
 */
static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
			   struct list_head *list)
{
	const u32 nsaved = xtfs->w_savedlen;
	u32 flushed = 0;

	/* Deliver the expected packet itself. */
	list_add_tail(&inskb->list, list);
	xtfs->w_wantseq++;

	/* Nothing saved, so nothing else can become in-order. */
	if (nsaved == 0)
		return;

	/* Deliver the run of consecutive saved packets at the window head;
	 * stop at the first hole.
	 */
	while (flushed < nsaved && xtfs->w_saved[flushed].skb) {
		list_add_tail(&xtfs->w_saved[flushed].skb->list, list);
		flushed++;
	}
	xtfs->w_wantseq += flushed;

	/* Drop the delivered slots and the hole that follows them; that hole
	 * is now the wanted sequence (slot 0), which is not part of the array.
	 */
	__vec_shift(xtfs, flushed + 1);
}

/* Set the slot's drop time and all the empty slots below it until reaching a
 * filled slot which will already be set.
 */
	return -EINPROGRESS;
static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index)
{
	struct skb_wseq *slots = xtfs->w_saved;
	time64_t deadline;
	int i;

	assert_spin_locked(&xtfs->drop_lock);

	/* A later slot is already in use: this slot's drop time was filled in
	 * when that later slot was stored, and the timer is already handled.
	 */
	if (xtfs->w_savedlen > index + 1)
		return;

	/* This is the furthest-ahead slot; it gets a fresh deadline. */
	deadline = ktime_get_raw_fast_ns();
	deadline += xtfs->drop_time_ns;

	/* Stamp this slot and every empty slot directly below it. */
	slots[index].drop_time = deadline;
	for (i = index - 1; i >= 0 && !slots[i].skb; i--)
		slots[i].drop_time = deadline;

	/* Every lower slot was empty, so no earlier deadline exists; arm the
	 * drop timer unless it is already queued.
	 */
	if (i < 0 && !hrtimer_is_queued(&xtfs->drop_timer))
		hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns,
			      IPTFS_HRTIMER_MODE);
}

static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
				  struct sk_buff *inskb,
				  struct list_head *freelist)
{
	const u64 inseq = __esp_seq(inskb);
	const u64 wantseq = xtfs->w_wantseq;
	const u64 distance = inseq - wantseq;
	const u32 savedlen = xtfs->w_savedlen;
	const u32 index = distance - 1;

	/* Handle future sequence number received which fits in the window.
	 *
	 * We know we don't have the seq we want so we won't be able to flush
	 * anything.
	 */

	/* slot count is 4, saved size is 3 savedlen is 2
	 *
	 * "window boundary" is based on the fixed window size
	 * distance is also slot number
	 * index is an array index (i.e., - 1 of slot)
	 * : : - implicit NULL after array len
	 *
	 *          +--------- used length (savedlen == 2)
	 *          |   +----- array size (nslots - 1 == 3)
	 *          |   |   + window boundary (nslots == 4)
	 *          V   V | V
	 *                |
	 *  0   1   2   3 |   slot number
	 * ---  0   1   2 |   array index
	 *     [-] [b] : :|   array
	 *
	 * "2" "3" "4" *5*|   seq numbers
	 *
	 * We receive seq number 5
	 * distance == 3 [inseq(5) - w_wantseq(2)]
	 * index == 2 [distance(6) - 1]
	 */

	if (xtfs->w_saved[index].skb) {
		/* a dup of a future */
		list_add_tail(&inskb->list, freelist);
		return;
	}

	xtfs->w_saved[index].skb = inskb;
	xtfs->w_savedlen = max(savedlen, index + 1);
	iptfs_set_window_drop_times(xtfs, index);
}

/* Handle a packet whose sequence number lies beyond the reorder window. The
 * window is slid forward far enough that the new packet occupies the last
 * slot; saved packets pushed out of the window are appended to @list, and
 * sequence numbers that leave the window unreceived are skipped.
 */
static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
				    struct sk_buff *inskb,
				    struct list_head *list)
{
	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
	const u64 inseq = __esp_seq(inskb);
	u32 savedlen = xtfs->w_savedlen;
	u64 wantseq = xtfs->w_wantseq;
	struct skb_wseq *wnext;
	struct sk_buff *slot0;
	u32 beyond, shifting, slot;
	u64 distance;

	/* Handle future sequence number received.
	 *
	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
	 * b/c we are beyond the window boundary.
	 *
	 * We know we don't have the wantseq so that counts as a drop.
	 */

	/* example: slot count is 4, array size is 3 savedlen is 2, slot 0 is
	 * the missing sequence number.
	 *
	 * the final slot at savedlen (index savedlen - 1) is always occupied.
	 *
	 * beyond is "beyond array size" not savedlen.
	 *
	 *          +--------- array length (savedlen == 2)
	 *          |   +----- array size (nslots - 1 == 3)
	 *          |   | +- window boundary (nslots == 4)
	 *          V   V |
	 *                |
	 *  0   1   2   3 |   slot number
	 * ---  0   1   2 |   array index
	 *     [b] [c] : :|   array
	 *                |
	 * "2" "3" "4" "5"|*6*  seq numbers
	 *
	 * We receive seq number 6
	 * distance == 4 [inseq(6) - w_wantseq(2)]
	 * newslot == distance
	 * index == 3 [distance(4) - 1]
	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
	 * shifting == 1 [min(savedlen(2), beyond(1))]
	 * slot0_skb == [b], and should match w_wantseq
	 *
	 *                +--- window boundary (nslots == 4)
	 *  0   1   2   3 | 4   slot number
	 * ---  0   1   2 | 3   array index
	 *     [b] : : : :|     array
	 * "2" "3" "4" "5" *6*  seq numbers
	 *
	 * We receive seq number 6
	 * distance == 4 [inseq(6) - w_wantseq(2)]
	 * newslot == distance
	 * index == 3 [distance(4) - 1]
	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
	 * shifting == 1 [min(savedlen(1), beyond(1))]
	 * slot0_skb == [b] and should match w_wantseq
	 *
	 *                +-- window boundary (nslots == 4)
	 *  0   1   2   3 | 4   5   6   slot number
	 * ---  0   1   2 | 3   4   5   array index
	 *     [-] [c] : :|             array
	 * "2" "3" "4" "5" "6" "7" *8*  seq numbers
	 *
	 * savedlen = 2, beyond = 3
	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
	 *
	 * NOTE(review): "missed", "lastdrop" and "extra" are not variables in
	 * this function; presumably they describe drop accounting that is not
	 * implemented here -- confirm or remove.
	 *
	 * We receive seq number 8
	 * distance == 6 [inseq(8) - w_wantseq(2)]
	 * newslot == distance
	 * index == 5 [distance(6) - 1]
	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
	 * shifting == 2 [min(savedlen(2), beyond(3))]
	 *
	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
	 */

	/* Now send any packets that are being shifted out of saved, and account
	 * for missing packets that are exiting the window as we shift it.
	 */

	/* NOTE(review): distance is u64 but beyond is u32, so a sequence gap of
	 * 2^32 or more would be truncated here -- confirm whether such a gap
	 * can reach this function.
	 */
	distance = inseq - wantseq;
	beyond = distance - (nslots - 1);

	/* If savedlen > beyond we are shifting some, else all. */
	shifting = min(savedlen, beyond);

	/* slot0 is the buf that just shifted out and into slot0 */
	slot0 = NULL;
	wnext = xtfs->w_saved;
	for (slot = 1; slot <= shifting; slot++, wnext++) {
		/* handle what was in slot0 before we occupy it */
		if (slot0)
			list_add_tail(&slot0->list, list);
		slot0 = wnext->skb;
		/* ownership moved to slot0; clear so the slot reads as empty */
		wnext->skb = NULL;
	}

	/* slot0 is now either NULL (in which case it's what we now are waiting
	 * for), or a buf in which case we need to handle it like we received
	 * it; however, we may be advancing past that buffer as well.
	 */

	/* Handle case where we need to shift more than we had saved, slot0 will
	 * be NULL iff savedlen is 0, otherwise slot0 will always be
	 * non-NULL b/c we shifted the final element, which is always set if
	 * there is any saved, into slot0.
	 */
	if (savedlen < beyond) {
		if (savedlen != 0)
			list_add_tail(&slot0->list, list);
		slot0 = NULL;
		/* slot0 has had an empty slot pushed into it */
	}

	/* Remove the entries */
	__vec_shift(xtfs, beyond);

	/* Advance want seq */
	xtfs->w_wantseq += beyond;

	/* Process drops here when implementing congestion control */

	/* We've shifted. plug the packet in at the end. */
	xtfs->w_savedlen = nslots - 1;
	xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
	iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);

	/* if we don't have a slot0 then we must wait for it */
	if (!slot0)
		return;

	/* If slot0, seq must match new want seq */

	/* slot0 is valid, treat like we received expected. */
	__reorder_this(xtfs, slot0, list);
}

/* Receive a new packet into the reorder window. Return a list of ordered
 * packets from the window.
 */
static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
				struct sk_buff *inskb, struct list_head *list,
				struct list_head *freelist)
{
	const u64 inseq = __esp_seq(inskb);
	u64 wantseq;

	assert_spin_locked(&xtfs->drop_lock);

	/* The very first packet defines where the sequence space starts. */
	if (unlikely(!xtfs->w_seq_set)) {
		xtfs->w_wantseq = inseq;
		xtfs->w_seq_set = true;
	}
	wantseq = xtfs->w_wantseq;

	/* Exactly what we were waiting for: deliver it and any followers. */
	if (likely(inseq == wantseq)) {
		__reorder_this(xtfs, inskb, list);
		return;
	}

	/* Older than the wanted sequence: too late to deliver. */
	if (inseq < wantseq) {
		__reorder_past(xtfs, inskb, freelist);
		return;
	}

	/* Ahead of the wanted sequence: park it if it lands within the
	 * window, otherwise slide the window forward to make room.
	 */
	if (inseq - wantseq <= xtfs->cfg.reorder_win_size)
		__reorder_future_fits(xtfs, inskb, freelist);
	else
		__reorder_future_shifts(xtfs, inskb, list);
}

/**
@@ -1238,23 +1606,92 @@ static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
 */
static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
{
	struct sk_buff *skb, *next;
	struct list_head list;
	struct xfrm_iptfs_data *xtfs;
	struct xfrm_state *x;
	u32 count;

	xtfs = container_of(me, typeof(*xtfs), drop_timer);
	x = xtfs->x;

	INIT_LIST_HEAD(&list);

	spin_lock(&xtfs->drop_lock);

	/* Drop any in progress packet; detach it under the lock and free it
	 * after unlocking.
	 */
	skb = xtfs->ra_newskb;
	xtfs->ra_newskb = NULL;

	/* Now drop as many packets as we should from the reordering window
	 * saved array. __reorder_drop() reads slot 0 unconditionally, so only
	 * call it when the window is in use.
	 */
	count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0;

	spin_unlock(&xtfs->drop_lock);

	if (skb)
		kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);

	/* Process the packets released from the window, in order, outside the
	 * lock.
	 */
	if (count) {
		list_for_each_entry_safe(skb, next, &list, list) {
			skb_list_del_init(skb);
			iptfs_input_ordered(x, skb);
		}
	}

	return HRTIMER_NORESTART;
}

/**
 * iptfs_input() - handle receipt of iptfs payload
 * @x: xfrm state
 * @skb: the packet
 *
 * We have an IPTFS payload order it if needed, then process newly in order
 * packets.
 *
 * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb.
 */
static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
{
	struct list_head freelist, list;
	struct xfrm_iptfs_data *xtfs = x->mode_data;
	struct sk_buff *next;

	/* Fast path for no reorder window. */
	if (xtfs->cfg.reorder_win_size == 0) {
		iptfs_input_ordered(x, skb);
		goto done;
	}

	/* Fetch list of in-order packets from the reordering window as well as
	 * a list of buffers we need to now free.
	 */
	INIT_LIST_HEAD(&list);
	INIT_LIST_HEAD(&freelist);

	spin_lock(&xtfs->drop_lock);
	iptfs_input_reorder(xtfs, skb, &list, &freelist);
	spin_unlock(&xtfs->drop_lock);

	list_for_each_entry_safe(skb, next, &list, list) {
		skb_list_del_init(skb);
		iptfs_input_ordered(x, skb);
	}

	list_for_each_entry_safe(skb, next, &freelist, list) {
		skb_list_del_init(skb);
		kfree_skb(skb);
	}
done:
	/* We always have dealt with the input SKB, either we are re-using it,
	 * or we have freed it. Return EINPROGRESS so that xfrm_input stops
	 * processing it.
	 */
	return -EINPROGRESS;
}

/* ================================= */
/* IPTFS Sending (ingress) Functions */
/* ================================= */
@@ -2009,11 +2446,24 @@ static int iptfs_user_init(struct net *net, struct xfrm_state *x,

	xc = &xtfs->cfg;
	xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
	xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW;
	xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC;
	xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;

	if (attrs[XFRMA_IPTFS_DONT_FRAG])
		xc->dont_frag = true;
	if (attrs[XFRMA_IPTFS_REORDER_WINDOW])
		xc->reorder_win_size =
			nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]);
	/* saved array is for saving 1..N seq nums from wantseq */
	if (xc->reorder_win_size) {
		xtfs->w_saved = kcalloc(xc->reorder_win_size,
					sizeof(*xtfs->w_saved), GFP_KERNEL);
		if (!xtfs->w_saved) {
			NL_SET_ERR_MSG(extack, "Cannot alloc reorder window");
			return -ENOMEM;
		}
	}
	if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
		xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
		if (!xc->pkt_size) {
@@ -2051,6 +2501,7 @@ static unsigned int iptfs_sa_len(const struct xfrm_state *x)

	if (x->dir == XFRM_SA_DIR_IN) {
		l += nla_total_size(sizeof(u32)); /* drop time usec */
		l += nla_total_size(sizeof(xc->reorder_win_size));
	} else {
		if (xc->dont_frag)
			l += nla_total_size(0);	  /* dont-frag flag */
@@ -2073,6 +2524,11 @@ static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
		q = xtfs->drop_time_ns;
		do_div(q, NSECS_IN_USEC);
		ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q);
		if (ret)
			return ret;

		ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW,
				  xc->reorder_win_size);
	} else {
		if (xc->dont_frag) {
			ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
@@ -2134,6 +2590,14 @@ static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig)
	xtfs->x = x;

	xtfs->ra_newskb = NULL;
	if (xtfs->cfg.reorder_win_size) {
		xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size,
					sizeof(*xtfs->w_saved), GFP_KERNEL);
		if (!xtfs->w_saved) {
			kfree_sensitive(xtfs);
			return -ENOMEM;
		}
	}

	return 0;
}
@@ -2160,6 +2624,7 @@ static void iptfs_destroy_state(struct xfrm_state *x)
{
	struct xfrm_iptfs_data *xtfs = x->mode_data;
	struct sk_buff_head list;
	struct skb_wseq *s, *se;
	struct sk_buff *skb;

	if (!xtfs)
@@ -2181,6 +2646,12 @@ static void iptfs_destroy_state(struct xfrm_state *x)
	if (xtfs->ra_newskb)
		kfree_skb(xtfs->ra_newskb);

	for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) {
		if (s->skb)
			kfree_skb(s->skb);
	}

	kfree_sensitive(xtfs->w_saved);
	kfree_sensitive(xtfs);

	module_put(x->mode_cbs->owner);