Commit 4c6283ec authored by Carlos Maiolino
Browse files

Merge tag 'xfs-zoned-allocator-2025-03-03' of...

Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices

xfs: add support for zoned devices

Add support for the new zoned space allocator and thus for zoned devices:

    https://zonedstorage.io/docs/introduction/zoned-storage

to XFS. This has been developed for and tested on both SMR hard drives,
which are the oldest and most common class of zoned devices:

   https://zonedstorage.io/docs/introduction/smr

and ZNS SSDs:

   https://zonedstorage.io/docs/introduction/zns

It has not been tested with zoned UFS devices, as their current capacity
points and performance characteristics aren't too interesting for XFS
use cases (but never say never).

Sequential-write-only zones are supported only for data, using a new
allocator for the RT device that maps each zone to an rtgroup which
is written sequentially.  All metadata and (for now) the log require
randomly writable space.  This means a realtime device is required
to support zoned storage, but for the common case of SMR hard drives
that contain randomly writable zones and sequential-write-required zones
on the same block device, the concept of an internal RT device is added,
which means using XFS on an SMR HDD is as simple as:

$ mkfs.xfs /dev/sda
$ mount /dev/sda /mnt

When using NVMe ZNS SSDs that do not support conventional zones, the
traditional multi-device RT configuration is required.  E.g. for an
SSD with a conventional namespace 1 and a zoned namespace 2:

$ mkfs.xfs /dev/nvme0n1 -r rtdev=/dev/nvme0n2
$ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt

The zoned allocator can also be used on conventional block devices, or
on conventional zones (e.g. when using an SMR HDD as the external RT
device).  For example using zoned XFS on normal SSDs shows very nice
performance advantages and write amplification reduction for intelligent
workloads like RocksDB.

Some work is still in progress or planned, but should not affect the
integration with the rest of XFS or the on-disk format:

 - support for quotas
 - support for reflinks

Note that the I/O path already supports reflink, but garbage collection
isn't refcount aware yet and would unshare shared blocks, thus rendering
the feature useless.
parents 0a1fd780 9c477912
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT)		+= $(addprefix libxfs/, \
				   xfs_rtbitmap.o \
				   xfs_rtgroup.o \
				   xfs_zones.o \
				   )

# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
				   xfs_quotaops.o

# xfs_rtbitmap is shared with libxfs
xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o
xfs-$(CONFIG_XFS_RT)		+= xfs_rtalloc.o \
				   xfs_zone_alloc.o \
				   xfs_zone_gc.o \
				   xfs_zone_info.o \
				   xfs_zone_space_resv.o

xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
+23 −293
Original line number Diff line number Diff line
@@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
#include "xfs_zone_alloc.h"

struct kmem_cache		*xfs_bmap_intent_cache;

@@ -171,18 +171,16 @@ xfs_bmbt_update(
 * Compute the worst-case number of indirect blocks that will be used
 * for ip's delayed extent of length "len".
 */
STATIC xfs_filblks_t
xfs_filblks_t
xfs_bmap_worst_indlen(
	xfs_inode_t	*ip,		/* incore inode pointer */
	struct xfs_inode	*ip,		/* incore inode pointer */
	xfs_filblks_t		len)		/* delayed extent length */
{
	int		level;		/* btree level number */
	int		maxrecs;	/* maximum record count at this level */
	xfs_mount_t	*mp;		/* mount structure */
	xfs_filblks_t	rval;		/* return value */
	struct xfs_mount	*mp = ip->i_mount;
	int			maxrecs = mp->m_bmap_dmxr[0];
	int			level;
	xfs_filblks_t		rval;

	mp = ip->i_mount;
	maxrecs = mp->m_bmap_dmxr[0];
	for (level = 0, rval = 0;
	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
	     level++) {
@@ -2571,146 +2569,6 @@ xfs_bmap_add_extent_unwritten_real(
#undef	PREV
}

/*
 * Convert a hole to a delayed allocation.
 *
 * Inserts the delalloc extent described by @new into the in-core extent list
 * of the given fork.  If @new is logically contiguous with a delalloc
 * neighbor on the left and/or right, and the combined length does not exceed
 * XFS_MAX_BMBT_EXTLEN, the records are merged instead of inserting a new one.
 *
 * When records are merged, the indirect block reservation carried in the
 * startblock field of each record is recomputed for the merged length and
 * capped at the sum of the old reservations; any surplus is handed back via
 * xfs_add_fdblocks() and the delalloc counters are adjusted to match.
 */
STATIC void
xfs_bmap_add_extent_hole_delay(
	xfs_inode_t		*ip,	/* incore inode pointer */
	int			whichfork,	/* fork receiving the extent */
	struct xfs_iext_cursor	*icur,	/* cursor at the insertion point */
	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
{
	struct xfs_ifork	*ifp;	/* inode fork pointer */
	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
	xfs_filblks_t		newlen=0;	/* new indirect size */
	xfs_filblks_t		oldlen=0;	/* old indirect size */
	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
	xfs_filblks_t		temp;	 /* temp for indirect calculations */

	ifp = xfs_ifork_ptr(ip, whichfork);
	/* The caller must hand us a delalloc (null startblock) record. */
	ASSERT(isnullstartblock(new->br_startblock));

	/*
	 * Check and set flags if this segment has a left neighbor
	 */
	if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
		state |= BMAP_LEFT_VALID;
		if (isnullstartblock(left.br_startblock))
			state |= BMAP_LEFT_DELAY;
	}

	/*
	 * Check and set flags if the current (right) segment exists.
	 * If it doesn't exist, we're converting the hole at end-of-file.
	 */
	if (xfs_iext_get_extent(ifp, icur, &right)) {
		state |= BMAP_RIGHT_VALID;
		if (isnullstartblock(right.br_startblock))
			state |= BMAP_RIGHT_DELAY;
	}

	/*
	 * Set contiguity flags on the left and right neighbors.
	 * Don't let extents get too large, even if the pieces are contiguous.
	 * Only delalloc neighbors qualify for merging.
	 */
	if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
	    left.br_startoff + left.br_blockcount == new->br_startoff &&
	    left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
		state |= BMAP_LEFT_CONTIG;

	/*
	 * If the left side already merges, the right side may only merge too
	 * when all three pieces together still fit in a single extent.
	 */
	if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
	    new->br_startoff + new->br_blockcount == right.br_startoff &&
	    new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
	    (!(state & BMAP_LEFT_CONTIG) ||
	     (left.br_blockcount + new->br_blockcount +
	      right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
		state |= BMAP_RIGHT_CONTIG;

	/*
	 * Switch out based on the contiguity flags.
	 */
	switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
	case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
		/*
		 * New allocation is contiguous with delayed allocations
		 * on the left and on the right.
		 * Merge all three into a single extent record.
		 */
		temp = left.br_blockcount + new->br_blockcount +
			right.br_blockcount;

		oldlen = startblockval(left.br_startblock) +
			startblockval(new->br_startblock) +
			startblockval(right.br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		left.br_startblock = nullstartblock(newlen);
		left.br_blockcount = temp;

		/*
		 * The cursor is on the right neighbor (it was read through
		 * icur above); presumably this drops that record before we
		 * step back and rewrite the left one to cover everything.
		 */
		xfs_iext_remove(ip, icur, state);
		xfs_iext_prev(ifp, icur);
		xfs_iext_update_extent(ip, state, icur, &left);
		break;

	case BMAP_LEFT_CONTIG:
		/*
		 * New allocation is contiguous with a delayed allocation
		 * on the left.
		 * Merge the new allocation with the left neighbor.
		 */
		temp = left.br_blockcount + new->br_blockcount;

		oldlen = startblockval(left.br_startblock) +
			startblockval(new->br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		left.br_blockcount = temp;
		left.br_startblock = nullstartblock(newlen);

		/* Step back onto the left neighbor and grow it in place. */
		xfs_iext_prev(ifp, icur);
		xfs_iext_update_extent(ip, state, icur, &left);
		break;

	case BMAP_RIGHT_CONTIG:
		/*
		 * New allocation is contiguous with a delayed allocation
		 * on the right.
		 * Merge the new allocation with the right neighbor.
		 */
		temp = new->br_blockcount + right.br_blockcount;
		oldlen = startblockval(new->br_startblock) +
			startblockval(right.br_startblock);
		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
					 oldlen);
		/* The merged record now starts where the new extent starts. */
		right.br_startoff = new->br_startoff;
		right.br_startblock = nullstartblock(newlen);
		right.br_blockcount = temp;
		xfs_iext_update_extent(ip, state, icur, &right);
		break;

	case 0:
		/*
		 * New allocation is not contiguous with another
		 * delayed allocation.
		 * Insert a new entry.
		 */
		oldlen = newlen = 0;
		xfs_iext_insert(ip, icur, new, state);
		break;
	}
	/*
	 * A merge can only shrink the combined indirect reservation (newlen is
	 * capped at oldlen above); give the surplus back.
	 */
	if (oldlen != newlen) {
		ASSERT(oldlen > newlen);
		xfs_add_fdblocks(ip->i_mount, oldlen - newlen);

		/*
		 * Nothing to do for disk quota accounting here.
		 */
		xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
	}
}

/*
 * Convert a hole to a real allocation.
 */
@@ -4039,144 +3897,6 @@ xfs_bmapi_read(
	return 0;
}

/*
 * Add a delayed allocation extent to an inode. Blocks are reserved from the
 * global pool and the extent inserted into the inode in-core extent tree.
 *
 * On entry, got refers to the first extent beyond the offset of the extent to
 * allocate or eof is specified if no such extent exists. On return, got refers
 * to the extent record that was inserted to the inode fork.
 *
 * Note that the allocated extent may have been merged with contiguous extents
 * during insertion into the inode fork. Thus, got does not reflect the current
 * state of the inode fork on return. If necessary, the caller can use the
 * extent cursor (icur) to look up the updated record in the inode fork.
 *
 * @ip:		inode receiving the reservation
 * @whichfork:	fork to insert into; XFS_DATA_FORK and XFS_COW_FORK are the
 *		cases handled specially below
 * @off:	file offset (in blocks) where the reservation should start
 * @len:	number of blocks the caller needs
 * @prealloc:	extra blocks requested beyond @len; dropped on retry if the
 *		reservation fails with -ENOSPC or -EDQUOT
 * @got:	in: next extent beyond @off (only consulted when !@eof);
 *		out: the delalloc record that was passed to the insert helper
 * @icur:	extent cursor for the insertion point
 * @eof:	nonzero if there is no extent beyond @off
 *
 * Returns 0 on success, or the error from the quota, rt extent or free block
 * reservation that failed.
 */
int
xfs_bmapi_reserve_delalloc(
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fileoff_t		off,
	xfs_filblks_t		len,
	xfs_filblks_t		prealloc,
	struct xfs_bmbt_irec	*got,
	struct xfs_iext_cursor	*icur,
	int			eof)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	xfs_extlen_t		alen;	/* length actually reserved */
	xfs_extlen_t		indlen;	/* worst-case indirect blocks */
	uint64_t		fdblocks;	/* blocks taken from fdblocks */
	int			error;
	xfs_fileoff_t		aoff;	/* offset actually reserved */
	bool			use_cowextszhint =
					whichfork == XFS_COW_FORK && !prealloc;

retry:
	/*
	 * Cap the alloc length. Keep track of prealloc so we know whether to
	 * tag the inode before we return.
	 */
	aoff = off;
	alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
	/* Don't run into the next extent when one exists. */
	if (!eof)
		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
	if (prealloc && alen >= len)
		prealloc = alen - len;

	/*
	 * If we're targeting the COW fork but aren't creating a speculative
	 * posteof preallocation, try to expand the reservation to align with
	 * the COW extent size hint if there's sufficient free space.
	 *
	 * Unlike the data fork, the CoW cancellation functions will free all
	 * the reservations at inactivation, so we don't require that every
	 * delalloc reservation have a dirty pagecache.
	 */
	if (use_cowextszhint) {
		struct xfs_bmbt_irec	prev;
		xfs_extlen_t		extsz = xfs_get_cowextsz_hint(ip);

		/* No previous extent: mark prev as absent for the aligner. */
		if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
			prev.br_startoff = NULLFILEOFF;

		error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
					       1, 0, &aoff, &alen);
		ASSERT(!error);
	}

	/*
	 * Make a transaction-less quota reservation for delayed allocation
	 * blocks.  This number gets adjusted later.  We return if we haven't
	 * allocated blocks already inside this loop.
	 */
	error = xfs_quota_reserve_blkres(ip, alen);
	if (error)
		goto out;

	/*
	 * Split changing sb for alen and indlen since they could be coming
	 * from different places.
	 */
	indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
	ASSERT(indlen > 0);

	/*
	 * Realtime inodes take their data blocks from the rt extent counter
	 * and only the indirect blocks from fdblocks; everything else takes
	 * both from fdblocks.
	 */
	fdblocks = indlen;
	if (XFS_IS_REALTIME_INODE(ip)) {
		error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
		if (error)
			goto out_unreserve_quota;
	} else {
		fdblocks += alen;
	}

	error = xfs_dec_fdblocks(mp, fdblocks, false);
	if (error)
		goto out_unreserve_frextents;

	ip->i_delayed_blks += alen;
	xfs_mod_delalloc(ip, alen, indlen);

	/* Describe the new delalloc extent; indlen rides in the startblock. */
	got->br_startoff = aoff;
	got->br_startblock = nullstartblock(indlen);
	got->br_blockcount = alen;
	got->br_state = XFS_EXT_NORM;

	xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);

	/*
	 * Tag the inode if blocks were preallocated. Note that COW fork
	 * preallocation can occur at the start or end of the extent, even when
	 * prealloc == 0, so we must also check the aligned offset and length.
	 */
	if (whichfork == XFS_DATA_FORK && prealloc)
		xfs_inode_set_eofblocks_tag(ip);
	if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
		xfs_inode_set_cowblocks_tag(ip);

	return 0;

	/* Unwind the reservations in the reverse order they were taken. */
out_unreserve_frextents:
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
out_unreserve_quota:
	if (XFS_IS_QUOTA_ON(mp))
		xfs_quota_unreserve_blkres(ip, alen);
out:
	if (error == -ENOSPC || error == -EDQUOT) {
		trace_xfs_delalloc_enospc(ip, off, len);

		/*
		 * A smaller request may still fit: drop the speculative
		 * preallocation and the CoW extent size alignment and try
		 * again.  Both flags are cleared, so this retries at most
		 * once.
		 */
		if (prealloc || use_cowextszhint) {
			/* retry without any preallocation */
			use_cowextszhint = false;
			prealloc = 0;
			goto retry;
		}
	}
	return error;
}

static int
xfs_bmapi_allocate(
	struct xfs_bmalloca	*bma)
@@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay(
	int			whichfork,
	struct xfs_iext_cursor	*icur,
	struct xfs_bmbt_irec	*got,
	struct xfs_bmbt_irec	*del)
	struct xfs_bmbt_irec	*del,
	uint32_t		bflags)	/* bmapi flags */
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay(
	da_diff = da_old - da_new;
	fdblocks = da_diff;

	if (isrt)
		xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
	else
	if (bflags & XFS_BMAPI_REMAP) {
		;
	} else if (isrt) {
		xfs_rtbxlen_t	rtxlen;

		rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
		if (xfs_is_zoned_inode(ip))
			xfs_zoned_add_available(mp, rtxlen);
		xfs_add_frextents(mp, rtxlen);
	} else {
		fdblocks += del->br_blockcount;
	}

	xfs_add_fdblocks(mp, fdblocks);
	xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5670,7 +5399,8 @@ __xfs_bunmapi(

delete:
		if (wasdel) {
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
			xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
					&del, flags);
		} else {
			error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
					&del, &tmp_logflags, whichfork,
+2 −5
Original line number Diff line number Diff line
@@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
		xfs_extnum_t nexts, int *done);
void	xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
		struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
		struct xfs_bmbt_irec *del);
		struct xfs_bmbt_irec *del, uint32_t bflags);
void	xfs_bmap_del_extent_cow(struct xfs_inode *ip,
		struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
		struct xfs_bmbt_irec *del);
@@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
		bool *done, xfs_fileoff_t stop_fsb);
int	xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
		xfs_fileoff_t split_offset);
int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
		xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
		struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
		int eof);
int	xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
		xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int	xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
		int fork);
int	xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
		struct xfs_alloc_arg *args);
xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);

enum xfs_bmap_intent_type {
	XFS_BMAP_MAP = 1,
+16 −4
Original line number Diff line number Diff line
@@ -178,9 +178,10 @@ typedef struct xfs_sb {

	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */

	uint8_t		sb_rgblklog;    /* rt group number shift */
	uint8_t		sb_pad[7];	/* zeroes */
	xfs_rfsblock_t	sb_rtstart;	/* start of internal RT section (FSB) */
	xfs_filblks_t	sb_rtreserved;	/* reserved (zoned) RT blocks */

	/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
	__be64		sb_metadirino;	/* metadata directory tree root */
	__be32		sb_rgcount;	/* # of realtime groups */
	__be32		sb_rgextents;	/* size of rtgroup in rtx */

	__u8		sb_rgblklog;    /* rt group number shift */
	__u8		sb_pad[7];	/* zeroes */
	__be64		sb_rtstart;	/* start of internal RT section (FSB) */
	__be64		sb_rtreserved;	/* reserved (zoned) RT blocks */

	/*
	 * The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE	(1 << 6)  /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 7)  /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR	(1 << 8)  /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED	(1 << 9)  /* zoned RT allocator */
#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS	(1 << 10) /* RTGs have LBA gaps */

#define XFS_SB_FEAT_INCOMPAT_ALL \
		(XFS_SB_FEAT_INCOMPAT_FTYPE | \
		 XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
		 XFS_SB_FEAT_INCOMPAT_NREXT64 | \
		 XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
		 XFS_SB_FEAT_INCOMPAT_PARENT | \
		 XFS_SB_FEAT_INCOMPAT_METADIR)
		 XFS_SB_FEAT_INCOMPAT_METADIR | \
		 XFS_SB_FEAT_INCOMPAT_ZONED | \
		 XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)

#define XFS_SB_FEAT_INCOMPAT_UNKNOWN	~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -952,7 +959,12 @@ struct xfs_dinode {
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__be32		di_cowextsize;	/* basic cow extent size for file */
	union {
		/* basic cow extent size for (regular) file */
		__be32		di_cowextsize;
		/* used blocks in RTG for (zoned) rtrmap inode */
		__be32		di_used_blocks;
	};
	__u8		di_pad2[12];	/* more padding for future expansion */

	/* fields only written to during inode creation */
+13 −1
Original line number Diff line number Diff line
@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
	uint32_t	checked;	/* o: checked fs & rt metadata	*/
	__u32		rgextents;	/* rt extents in a realtime group */
	__u32		rgcount;	/* number of realtime groups	*/
	__u64		reserved[16];	/* reserved space		*/
	__u64		rtstart;	/* start of internal rt section */
	__u64		rtreserved;	/* RT (zoned) reserved blocks	*/
	__u64		reserved[14];	/* reserved space		*/
};

#define XFS_FSOP_GEOM_SICK_COUNTERS	(1 << 0)  /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT	(1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR	(1 << 26) /* metadata directories */
#define XFS_FSOP_GEOM_FLAGS_ZONED	(1 << 27) /* zoned rt device */

/*
 * Minimum and maximum sizes need for growth checks.
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE	     _IOW ('X', 131, struct xfs_commit_range)
/*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */

/*
 * Identifiers for the devices that can make up a single XFS file system.
 * These values are reported in the fsmap fmr_device field when the file
 * system uses an internal RT device.  The numbering is explicit so that the
 * values cannot shift if members are ever reordered.
 */
enum xfs_device {
	XFS_DEV_DATA = 1,	/* data device */
	XFS_DEV_LOG = 2,	/* log device */
	XFS_DEV_RT = 3,		/* realtime (RT) device */
};

#ifndef HAVE_BBMACROS
/*
Loading