Commit 61ba89b5 authored by Gao Xiang
Browse files

erofs: add 48-bit block addressing on-disk support



The current 32-bit block addressing limits EROFS to a 16TiB maximum
volume size with 4KiB blocks.  However, several new use cases now
require larger capacity support:
 - Massive datasets for model training in order to boost random
   sampling performance for each epoch;

 - Object storage clients using EROFS direct passthrough.

This extends core on-disk structures to support 48-bit block addressing,
such as inodes, device slots, and inode chunks.

Additionally:
 - Expand superblock root NID to 8-byte `rootnid_8b` to enable full
   out-of-place update incremental builds;

 - Introduce `epoch` field in the superblock as well as add `mtime`
   field to 32-byte compact inodes for basic timestamp support.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Acked-by: Chao Yu <chao@kernel.org>
Link: https://lore.kernel.org/r/20250310095459.2620647-4-hsiangkao@linux.alibaba.com
parent 3422dfa9
Loading
Loading
Loading
Loading
+7 −8
Original line number Diff line number Diff line
@@ -95,7 +95,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)

		map->m_flags = EROFS_MAP_MAPPED;
		if (map->m_la < pos) {
			map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
			map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
			map->m_llen = pos - map->m_la;
		} else {
			map->m_pa = erofs_iloc(inode) + vi->inode_isize +
@@ -124,7 +124,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
	map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
			    round_up(inode->i_size - map->m_la, blksz));
	if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
		startblk = le32_to_cpu(idx->blkaddr);
		startblk = le32_to_cpu(idx->startblk_lo);
		if (startblk != EROFS_NULL_ADDR) {
			map->m_deviceid = le16_to_cpu(idx->device_id) &
				EROFS_SB(sb)->device_id_mask;
@@ -168,7 +168,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
	struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
	struct erofs_device_info *dif;
	erofs_off_t startoff, length;
	erofs_off_t startoff;
	int id;

	erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
@@ -181,7 +181,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
			return -ENODEV;
		}
		if (devs->flatdev) {
			map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
			map->m_pa += erofs_pos(sb, dif->uniaddr);
			up_read(&devs->rwsem);
			return 0;
		}
@@ -190,13 +190,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
	} else if (devs->extra_devices && !devs->flatdev) {
		down_read(&devs->rwsem);
		idr_for_each_entry(&devs->tree, dif, id) {
			if (!dif->mapped_blkaddr)
			if (!dif->uniaddr)
				continue;

			startoff = erofs_pos(sb, dif->mapped_blkaddr);
			length = erofs_pos(sb, dif->blocks);
			startoff = erofs_pos(sb, dif->uniaddr);
			if (map->m_pa >= startoff &&
			    map->m_pa < startoff + length) {
			    map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
				map->m_pa -= startoff;
				erofs_fill_from_devinfo(map, sb, dif);
				break;
+42 −49
Original line number Diff line number Diff line
@@ -30,25 +30,19 @@
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS	0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE		0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES	0x00000040
#define EROFS_FEATURE_INCOMPAT_48BIT		0x00000080
#define EROFS_ALL_FEATURE_INCOMPAT		\
	(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
	 EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
	 EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
	 EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
	 EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
	 EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
	 EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
	 EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
	 EROFS_FEATURE_INCOMPAT_DEDUPE | \
	 EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
	((EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES << 1) - 1)

#define EROFS_SB_EXTSLOT_SIZE	16

struct erofs_deviceslot {
	u8 tag[64];		/* digest(sha256), etc. */
	__le32 blocks;		/* total fs blocks of this device */
	__le32 mapped_blkaddr;	/* map starting at mapped_blkaddr */
	u8 reserved[56];
	__le32 blocks_lo;	/* total blocks count of this device */
	__le32 uniaddr_lo;	/* unified starting block of this device */
	__le32 blocks_hi;	/* total blocks count MSB */
	__le16 uniaddr_hi;	/* unified starting block MSB */
	u8 reserved[50];
};
#define EROFS_DEVT_SLOT_SIZE	sizeof(struct erofs_deviceslot)

@@ -59,13 +53,14 @@ struct erofs_super_block {
	__le32 feature_compat;
	__u8 blkszbits;         /* filesystem block size in bit shift */
	__u8 sb_extslots;	/* superblock size = 128 + sb_extslots * 16 */

	__le16 root_nid;	/* nid of root directory */
	union {
		__le16 rootnid_2b;	/* nid of root directory */
		__le16 blocks_hi;	/* (48BIT on) blocks count MSB */
	} rb;
	__le64 inos;            /* total valid ino # (== f_files - f_favail) */

	__le64 build_time;      /* compact inode time derivation */
	__le32 build_time_nsec;	/* compact inode time derivation in ns scale */
	__le32 blocks;          /* used for statfs */
	__le64 epoch;		/* base seconds used for compact inodes */
	__le32 fixed_nsec;	/* fixed nanoseconds for compact inodes */
	__le32 blocks_lo;	/* blocks count LSB */
	__le32 meta_blkaddr;	/* start block address of metadata area */
	__le32 xattr_blkaddr;	/* start block address of shared xattr area */
	__u8 uuid[16];          /* 128-bit uuid for volume */
@@ -84,7 +79,10 @@ struct erofs_super_block {
	__le32 xattr_prefix_start;	/* start of long xattr prefixes */
	__le64 packed_nid;	/* nid of the special packed inode */
	__u8 xattr_filter_reserved; /* reserved for xattr name filter */
	__u8 reserved2[23];
	__u8 reserved[3];
	__le32 build_time;	/* seconds added to epoch for mkfs time */
	__le64 rootnid_8b;	/* (48BIT on) nid of root directory */
	__u8 reserved2[8];
};

/*
@@ -117,17 +115,16 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)

#define EROFS_I_VERSION_BIT	0
#define EROFS_I_DATALAYOUT_BIT	1
#define EROFS_I_ALL_BIT			4

#define EROFS_I_ALL	((1 << EROFS_I_ALL_BIT) - 1)
#define EROFS_I_NLINK_1_BIT	4	/* non-directory compact inodes only */
#define EROFS_I_ALL		((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)

/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK		0x001F
/* with chunk indexes or just a 4-byte blkaddr array */
/* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES		0x0020
#define EROFS_CHUNK_FORMAT_48BIT		0x0040

#define EROFS_CHUNK_FORMAT_ALL	\
	(EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
#define EROFS_CHUNK_FORMAT_ALL	((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)

/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT	0
@@ -140,45 +137,40 @@ struct erofs_inode_chunk_info {
};

union erofs_inode_i_u {
	/* total compressed blocks for compressed inodes */
	__le32 compressed_blocks;

	/* block address for uncompressed flat inodes */
	__le32 raw_blkaddr;

	/* for device files, used to indicate old/new device # */
	__le32 rdev;

	/* for chunk-based files, it contains the summary info */
	__le32 blocks_lo;	/* total blocks count (if compressed inodes) */
	__le32 startblk_lo;	/* starting block number (if flat inodes) */
	__le32 rdev;		/* device ID (if special inodes) */
	struct erofs_inode_chunk_info c;
};

/*
 * 16-bit on-disk inode slot with three alternative interpretations: a link
 * count, or the most-significant bits of a block count / starting block
 * number.  The *_hi members presumably pair with the 32-bit `blocks_lo` and
 * `startblk_lo` members of union erofs_inode_i_u to form 48-bit values
 * (NOTE(review): confirm against the code that combines them).
 */
union erofs_inode_i_nb {
	__le16 nlink;		/* if EROFS_I_NLINK_1_BIT is unset */
	__le16 blocks_hi;	/* total blocks count MSB */
	__le16 startblk_hi;	/* starting block number MSB */
};

/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
	__le16 i_format;	/* inode format hints */

/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
	__le16 i_xattr_icount;
	__le16 i_mode;
	__le16 i_nlink;
	union erofs_inode_i_nb i_nb;
	__le32 i_size;
	__le32 i_reserved;
	__le32 i_mtime;
	union erofs_inode_i_u i_u;

	__le32 i_ino;		/* only used for 32-bit stat compatibility */
	__le16 i_uid;
	__le16 i_gid;
	__le32 i_reserved2;
	__le32 i_reserved;
};

/* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended {
	__le16 i_format;	/* inode format hints */

/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
	__le16 i_xattr_icount;
	__le16 i_mode;
	__le16 i_reserved;
	union erofs_inode_i_nb i_nb;
	__le64 i_size;
	union erofs_inode_i_u i_u;

@@ -248,6 +240,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
	if (!i_xattr_icount)
		return 0;

	/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
	return sizeof(struct erofs_xattr_ibody_header) +
		sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
}
@@ -266,11 +259,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE	sizeof(__le32)

/* 8-byte inode chunk indexes */
/* 8-byte inode chunk index */
struct erofs_inode_chunk_index {
	__le16 advise;		/* always 0, don't care for now */
	__le16 startblk_hi;	/* starting block number MSB */
	__le16 device_id;	/* back-end storage id (with bits masked) */
	__le32 blkaddr;		/* start block address of this inode chunk */
	__le32 startblk_lo;	/* starting block number of this chunk */
};

/* dirent sorts in alphabet order, thus we can do binary search */
+3 −3
Original line number Diff line number Diff line
@@ -108,7 +108,7 @@ static int erofs_read_inode(struct inode *inode)
		iu = dic->i_u;
		i_uid_write(inode, le16_to_cpu(dic->i_uid));
		i_gid_write(inode, le16_to_cpu(dic->i_gid));
		set_nlink(inode, le16_to_cpu(dic->i_nlink));
		set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
		inode_set_mtime(inode, sbi->build_time, sbi->build_time_nsec);

		inode->i_size = le32_to_cpu(dic->i_size);
@@ -129,7 +129,7 @@ static int erofs_read_inode(struct inode *inode)
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
		vi->startblk = le32_to_cpu(iu.startblk_lo);
		if(S_ISLNK(inode->i_mode)) {
			err = erofs_fill_symlink(inode, ptr, ofs);
			if (err)
@@ -152,7 +152,7 @@ static int erofs_read_inode(struct inode *inode)
	}

	if (erofs_inode_is_data_compressed(vi->datalayout))
		inode->i_blocks = le32_to_cpu(iu.compressed_blocks) <<
		inode->i_blocks = le32_to_cpu(iu.blocks_lo) <<
					(sb->s_blocksize_bits - 9);
	else
		inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+3 −3
Original line number Diff line number Diff line
@@ -47,8 +47,8 @@ struct erofs_device_info {
	struct dax_device *dax_dev;
	u64 dax_part_off;

	u32 blocks;
	u32 mapped_blkaddr;
	erofs_blk_t blocks;
	erofs_blk_t uniaddr;
};

enum {
@@ -252,7 +252,7 @@ struct erofs_inode {
	unsigned int *xattr_shared_xattrs;

	union {
		erofs_blk_t raw_blkaddr;
		erofs_blk_t startblk;
		struct {
			unsigned short	chunkformat;
			unsigned char	chunkbits;
+6 −6
Original line number Diff line number Diff line
@@ -178,8 +178,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
		dif->file = file;
	}

	dif->blocks = le32_to_cpu(dis->blocks);
	dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
	dif->blocks = le32_to_cpu(dis->blocks_lo);
	dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
	sbi->total_blocks += dif->blocks;
	*pos += EROFS_DEVT_SLOT_SIZE;
	return 0;
@@ -299,7 +299,7 @@ static int erofs_read_superblock(struct super_block *sb)
			  sbi->sb_size);
		goto out;
	}
	sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
	sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
	sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
	sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -308,12 +308,12 @@ static int erofs_read_superblock(struct super_block *sb)
	sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
	sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
	sbi->root_nid = le16_to_cpu(dsb->root_nid);
	sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
	sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
	sbi->inos = le64_to_cpu(dsb->inos);

	sbi->build_time = le64_to_cpu(dsb->build_time);
	sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
	sbi->build_time = le64_to_cpu(dsb->epoch);
	sbi->build_time_nsec = le32_to_cpu(dsb->fixed_nsec);

	super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));