Commit 2167eaab authored by Christoph Hellwig
Browse files

xfs: define the zoned on-disk format



Zoned file systems reuse the basic RT group enabled XFS file system
structure to support a mode where each RT group is always written from
start to end and then reset for reuse (after moving out any remaining
data).  There are a few minor but important changes, which are indicated
by a new incompat flag:

1) there are no bitmap and summary inodes, thus the
   /rtgroups/{rgno}.{bitmap,summary} metadir files do not exist and the
   sb_rbmblocks superblock field must be cleared to zero.

2) there is a new superblock field that specifies the start of an
   internal RT section.  This allows supporting SMR HDDs that have random
   writable space at the beginning which is used for the XFS data device
   (which really is the metadata device for this configuration), directly
   followed by a RT device on the same block device.  While something
   similar could be achieved using dm-linear, just having a single device
   directly consumed by XFS makes handling the file systems a lot easier.

3) there is another superblock field that tracks the amount of reserved
   space (or overprovisioning) that is never used for user capacity, but
   allows GC to run more smoothly.

4) an overlay of the cowextsize field for the rtrmap inode so that we
   can persistently track the total amount of rtblocks currently used in
   a RT group.  There is no data structure other than the rmap that
   tracks used space in an RT group, and this counter is used to decide
   when a RT group has been entirely emptied, and to select one that
   is relatively empty if garbage collection needs to be performed.
   While this counter could be tracked entirely in memory and rebuilt
   from the rmap at mount time, that would lead to very long mount times
   with the large number of RT groups implied by the number of hardware
   zones, especially on SMR hard drives with 256MB zone sizes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
parent aacde95a
Loading
Loading
Loading
Loading
+12 −3
Original line number Diff line number Diff line
@@ -178,9 +178,10 @@ typedef struct xfs_sb {

	xfs_rgnumber_t	sb_rgcount;	/* number of realtime groups */
	xfs_rtxlen_t	sb_rgextents;	/* size of a realtime group in rtx */

	uint8_t		sb_rgblklog;    /* rt group number shift */
	uint8_t		sb_pad[7];	/* zeroes */
	xfs_rfsblock_t	sb_rtstart;	/* start of internal RT section (FSB) */
	xfs_filblks_t	sb_rtreserved;	/* reserved (zoned) RT blocks */

	/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
	__be64		sb_metadirino;	/* metadata directory tree root */
	__be32		sb_rgcount;	/* # of realtime groups */
	__be32		sb_rgextents;	/* size of rtgroup in rtx */

	__u8		sb_rgblklog;    /* rt group number shift */
	__u8		sb_pad[7];	/* zeroes */
	__be64		sb_rtstart;	/* start of internal RT section (FSB) */
	__be64		sb_rtreserved;	/* reserved (zoned) RT blocks */

	/*
	 * The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,8 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE	(1 << 6)  /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT	(1 << 7)  /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR	(1 << 8)  /* metadata dir tree */
#define XFS_SB_FEAT_INCOMPAT_ZONED	(1 << 9)  /* zoned RT allocator */

#define XFS_SB_FEAT_INCOMPAT_ALL \
		(XFS_SB_FEAT_INCOMPAT_FTYPE | \
		 XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -952,7 +956,12 @@ struct xfs_dinode {
	__be64		di_changecount;	/* number of attribute changes */
	__be64		di_lsn;		/* flush sequence */
	__be64		di_flags2;	/* more random flags */
	__be32		di_cowextsize;	/* basic cow extent size for file */
	union {
		/* basic cow extent size for (regular) file */
		__be32		di_cowextsize;
		/* used blocks in RTG for (zoned) rtrmap inode */
		__be32		di_used_blocks;
	};
	__u8		di_pad2[12];	/* more padding for future expansion */

	/* fields only written to during inode creation */
+16 −5
Original line number Diff line number Diff line
@@ -252,7 +252,10 @@ xfs_inode_from_disk(
					   be64_to_cpu(from->di_changecount));
		ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
		ip->i_diflags2 = be64_to_cpu(from->di_flags2);
		/* also covers the di_used_blocks union arm: */
		ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
		BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
			     sizeof(from->di_used_blocks));
	}

	error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
		to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
		to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
		to->di_flags2 = cpu_to_be64(ip->i_diflags2);
		/* also covers the di_used_blocks union arm: */
		to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
		to->di_ino = cpu_to_be64(ip->i_ino);
		to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,18 @@ xfs_dinode_verify(
	    !xfs_has_rtreflink(mp))
		return __this_address;

	if (xfs_has_zoned(mp) &&
	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
		if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
			return __this_address;
	} else {
		/* COW extent size hint validation */
	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
		fa = xfs_inode_validate_cowextsize(mp,
				be32_to_cpu(dip->di_cowextsize),
				mode, flags, flags2);
		if (fa)
			return fa;
	}

	/* bigtime iflag can only happen on bigtime filesystems */
	if (xfs_dinode_has_bigtime(dip) &&
+1 −0
Original line number Diff line number Diff line
@@ -322,6 +322,7 @@ xfs_inode_init(

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		/* also covers the di_used_blocks union arm: */
		ip->i_cowextsize = 0;
		times |= XFS_ICHGTIME_CREATE;
	}
+6 −1
Original line number Diff line number Diff line
@@ -475,7 +475,12 @@ struct xfs_log_dinode {
	xfs_lsn_t	di_lsn;

	uint64_t	di_flags2;	/* more random flags */
	uint32_t	di_cowextsize;	/* basic cow extent size for file */
	union {
		/* basic cow extent size for (regular) file */
		uint32_t		di_cowextsize;
		/* used blocks in RTG for (zoned) rtrmap inode */
		uint32_t		di_used_blocks;
	};
	uint8_t		di_pad2[12];	/* more padding for future expansion */

	/* fields only written to during inode creation */
+4 −2
Original line number Diff line number Diff line
@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
			16299260424LL);

	/* superblock field checks we got from xfs/122 */
	XFS_CHECK_STRUCT_SIZE(struct xfs_dsb,		288);
	XFS_CHECK_STRUCT_SIZE(struct xfs_sb,		288);
	XFS_CHECK_STRUCT_SIZE(struct xfs_dsb,		304);
	XFS_CHECK_STRUCT_SIZE(struct xfs_sb,		304);
	XFS_CHECK_SB_OFFSET(sb_magicnum,		0);
	XFS_CHECK_SB_OFFSET(sb_blocksize,		4);
	XFS_CHECK_SB_OFFSET(sb_dblocks,			8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
	XFS_CHECK_SB_OFFSET(sb_rgextents,		276);
	XFS_CHECK_SB_OFFSET(sb_rgblklog,		280);
	XFS_CHECK_SB_OFFSET(sb_pad,			281);
	XFS_CHECK_SB_OFFSET(sb_rtstart,			288);
	XFS_CHECK_SB_OFFSET(sb_rtreserved,		296);
}

#endif /* __XFS_ONDISK_H */
Loading