Commit 4eb559dd authored by Darrick J. Wong's avatar Darrick J. Wong
Browse files

Merge tag 'refcount-cow-domain-6.1_2022-10-31' of...

Merge tag 'refcount-cow-domain-6.1_2022-10-31' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux

 into xfs-6.1-fixesA

xfs: improve runtime refcountbt corruption detection

Fuzz testing of the refcount btree demonstrated a weakness in validation
of refcount btree records during normal runtime.  The idea of using the
upper bit of the rc_startblock field to separate the refcount records
into one group for shared space and another for CoW staging extents was
added at the last minute.  The incore struct left this bit encoded in
the upper bit of the startblock field, which makes it all too easy for
arithmetic operations to overflow if we don't detect the cowflag
properly.

When I ran a norepair fuzz tester, I was able to crash the kernel on one
of these accidental overflows by fuzzing a key record in a node block,
which broke lookups.  To fix the problem, make the domain (shared/cow) a
separate field in the incore record.

Unfortunately, a customer also hit this once in production.  Due to bugs
in the kernel running on the VM host, writes to the disk image would
occasionally be lost.  Given sufficient memory pressure on the VM guest,
a refcountbt xfs_buf could be reclaimed and later reloaded from the
stale copy on the virtual disk.  The stale disk contents were a refcount
btree leaf block full of records for the wrong domain, and this caused
an infinite loop in the guest VM.

v2: actually include the refcount adjust loop invariant checking patch;
    move the deferred refcount continuation checks earlier in the series;
    break up the megapatch into smaller pieces; fix an uninitialized list
    error.
v3: in the continuation check patch, verify the per-ag extent before
    converting it to a fsblock

Signed-off-by: default avatarDarrick J. Wong <djwong@kernel.org>

* tag 'refcount-cow-domain-6.1_2022-10-31' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: rename XFS_REFC_COW_START to _COWFLAG
  xfs: fix uninitialized list head in struct xfs_refcount_recovery
  xfs: fix agblocks check in the cow leftover recovery function
  xfs: check record domain when accessing refcount records
  xfs: remove XFS_FIND_RCEXT_SHARED and _COW
  xfs: refactor domain and refcount checking
  xfs: report refcount domain in tracepoints
  xfs: track cow/shared record domains explicitly in xfs_refcount_irec
  xfs: refactor refcount record usage in xchk_refcountbt_rec
  xfs: move _irec structs to xfs_types.h
  xfs: check deferred refcount op continuation parameters
  xfs: create a predicate to verify per-AG extents
  xfs: make sure aglen never goes negative in xfs_refcount_adjust_extents
parents 9f187ba0 8b972158
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -133,6 +133,21 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno)
	return true;
}

static inline bool
xfs_verify_agbext(
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_agblock_t		len)
{
	if (agbno + len <= agbno)
		return false;

	if (!xfs_verify_agbno(pag, agbno))
		return false;

	return xfs_verify_agbno(pag, agbno + len - 1);
}

/*
 * Verify that an AG inode number pointer neither points outside the AG
 * nor points at static metadata.
+1 −5
Original line number Diff line number Diff line
@@ -263,11 +263,7 @@ xfs_alloc_get_rec(
		goto out_bad_rec;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbno(pag, *bno))
		goto out_bad_rec;
	if (*bno > *bno + *len)
		goto out_bad_rec;
	if (!xfs_verify_agbno(pag, *bno + *len - 1))
	if (!xfs_verify_agbext(pag, *bno, *len))
		goto out_bad_rec;

	return 0;
+1 −21
Original line number Diff line number Diff line
@@ -1564,20 +1564,6 @@ struct xfs_rmap_rec {
#define RMAPBT_UNUSED_OFFSET_BITLEN	7
#define RMAPBT_OFFSET_BITLEN		54

#define XFS_RMAP_ATTR_FORK		(1 << 0)
#define XFS_RMAP_BMBT_BLOCK		(1 << 1)
#define XFS_RMAP_UNWRITTEN		(1 << 2)
#define XFS_RMAP_KEY_FLAGS		(XFS_RMAP_ATTR_FORK | \
					 XFS_RMAP_BMBT_BLOCK)
#define XFS_RMAP_REC_FLAGS		(XFS_RMAP_UNWRITTEN)
struct xfs_rmap_irec {
	xfs_agblock_t	rm_startblock;	/* extent start block */
	xfs_extlen_t	rm_blockcount;	/* extent length */
	uint64_t	rm_owner;	/* extent owner */
	uint64_t	rm_offset;	/* offset within the owner */
	unsigned int	rm_flags;	/* state flags */
};

/*
 * Key structure
 *
@@ -1626,7 +1612,7 @@ unsigned int xfs_refc_block(struct xfs_mount *mp);
 * on the startblock.  This speeds up mount time deletion of stale
 * staging extents because they're all at the right side of the tree.
 */
#define XFS_REFC_COW_START		((xfs_agblock_t)(1U << 31))
#define XFS_REFC_COWFLAG		(1U << 31)
#define REFCNTBT_COWFLAG_BITLEN		1
#define REFCNTBT_AGBLOCK_BITLEN		31

@@ -1640,12 +1626,6 @@ struct xfs_refcount_key {
	__be32		rc_startblock;	/* starting block number */
};

struct xfs_refcount_irec {
	xfs_agblock_t	rc_startblock;	/* starting block number */
	xfs_extlen_t	rc_blockcount;	/* count of free blocks */
	xfs_nlink_t	rc_refcount;	/* number of inodes linked here */
};

#define MAXREFCOUNT	((xfs_nlink_t)~0U)
#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)

+199 −87

File changed.

Preview size limit exceeded, changes collapsed.

+36 −4
Original line number Diff line number Diff line
@@ -14,14 +14,33 @@ struct xfs_bmbt_irec;
struct xfs_refcount_irec;

extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
		xfs_agblock_t bno, int *stat);
		enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
		xfs_agblock_t bno, int *stat);
		enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
		xfs_agblock_t bno, int *stat);
		enum xfs_refc_domain domain, xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
		struct xfs_refcount_irec *irec, int *stat);

static inline uint32_t
xfs_refcount_encode_startblock(
	xfs_agblock_t		startblock,
	enum xfs_refc_domain	domain)
{
	uint32_t		start;

	/*
	 * low level btree operations need to handle the generic btree range
	 * query functions (which set rc_domain == -1U), so we check that the
	 * domain is /not/ shared.
	 */
	start = startblock & ~XFS_REFC_COWFLAG;
	if (domain != XFS_REFC_DOMAIN_SHARED)
		start |= XFS_REFC_COWFLAG;

	return start;
}

enum xfs_refcount_intent_type {
	XFS_REFCOUNT_INCREASE = 1,
	XFS_REFCOUNT_DECREASE,
@@ -36,6 +55,18 @@ struct xfs_refcount_intent {
	xfs_fsblock_t				ri_startblock;
};

/* Check that the refcount is appropriate for the record domain. */
static inline bool
xfs_refcount_check_domain(
	const struct xfs_refcount_irec	*irec)
{
	if (irec->rc_domain == XFS_REFC_DOMAIN_COW && irec->rc_refcount != 1)
		return false;
	if (irec->rc_domain == XFS_REFC_DOMAIN_SHARED && irec->rc_refcount < 2)
		return false;
	return true;
}

void xfs_refcount_increase_extent(struct xfs_trans *tp,
		struct xfs_bmbt_irec *irec);
void xfs_refcount_decrease_extent(struct xfs_trans *tp,
@@ -79,7 +110,8 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
#define XFS_REFCOUNT_ITEM_OVERHEAD	32

extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
		xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
		enum xfs_refc_domain domain, xfs_agblock_t bno,
		xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
		struct xfs_refcount_irec *irec);
Loading