Commit 69a3a0a4 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull erofs updates from Gao Xiang:
 "In this cycle, we add file-backed mount support, which has has been a
  strong requirement for years. It is especially useful when there are
  thousands of images running on the same host for containers and other
  sandbox use cases, unlike OS image use cases.

  Without file-backed mounts, it's hard for container runtimes to manage
  and isolate so many unnecessary virtual block devices safely and
  efficiently, therefore file-backed mounts are highly preferred. For
  EROFS users, ComposeFS [1], containerd, and Android APEXes [2] will
  directly benefit from it, and I've seen no risk in implementing it as
  a completely immutable filesystem.

  The previous experimental feature "EROFS over fscache" is now marked
  as deprecated because:

   - Fscache is no longer an independent subsystem and has been merged
     into netfs, which was somewhat unexpected when it was proposed.

   - New HSM "fanotify pre-content hooks" [3] will be landed upstream.
     These hooks will replace "EROFS over fscache" in a simpler way, as
     EROFS won't be bother with kernel caching anymore. Userspace
     programs can also manage their own caching hierarchy more flexibly.

  Once the HSM "fanotify pre-content hooks" is landed, I will remove the
  fscache backend entirely as an internal dependency cleanup. More
  backgrounds are listed in the original patchset [4].

  In addition to that, there are bugfixes and cleanups as usual.

  Summary:

   - Support file-backed mounts for containers and sandboxes

   - Mark the experimental fscache backend as deprecated

   - Handle overlapped pclusters caused by crafted images properly

   - Fix a failure path which could cause infinite loops in
     z_erofs_init_decompressor()

   - Get rid of unnecessary NOFAILs

   - Harmless on-disk hardening & minor cleanups"

* tag 'erofs-for-6.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: reject inodes with negative i_size
  erofs: restrict pcluster size limitations
  erofs: allocate more short-lived pages from reserved pool first
  erofs: sunset unneeded NOFAILs
  erofs: simplify erofs_map_blocks_flatmode()
  erofs: refactor read_inode calling convention
  erofs: use kmemdup_nul in erofs_fill_symlink
  erofs: mark experimental fscache backend deprecated
  erofs: support compressed inodes for fileio
  erofs: support unencoded inodes for fileio
  erofs: add file-backed mount support
  erofs: handle overlapped pclusters out of crafted images properly
  erofs: fix error handling in z_erofs_init_decompressor
  erofs: clean up erofs_register_sysfs()
  erofs: fix incorrect symlink detection in fast symlink
parents 7a40974f 025497e1
Loading
Loading
Loading
Loading
+21 −1
Original line number Diff line number Diff line
@@ -74,6 +74,23 @@ config EROFS_FS_SECURITY

	  If you are not using a security module, say N.

config EROFS_FS_BACKED_BY_FILE
	bool "File-backed EROFS filesystem support"
	depends on EROFS_FS
	default y
	help
	  This allows EROFS to use filesystem image files directly, without
	  the intercession of loopback block devices or likewise. It is
	  particularly useful for container images with numerous blobs and
	  other sandboxes, where loop devices behave intricately.  It can also
	  be used to simplify error-prone lifetime management of unnecessary
	  virtual block devices.

	  Note that this feature, along with ongoing fanotify pre-content
	  hooks, will eventually replace "EROFS over fscache."

	  If you don't want to enable this feature, say N.

config EROFS_FS_ZIP
	bool "EROFS Data Compression Support"
	depends on EROFS_FS
@@ -128,7 +145,7 @@ config EROFS_FS_ZIP_ZSTD
	  If unsure, say N.

config EROFS_FS_ONDEMAND
	bool "EROFS fscache-based on-demand read support"
	bool "EROFS fscache-based on-demand read support (deprecated)"
	depends on EROFS_FS
	select NETFS_SUPPORT
	select FSCACHE
@@ -138,6 +155,9 @@ config EROFS_FS_ONDEMAND
	  This permits EROFS to use fscache-backed data blobs with on-demand
	  read support.

	  It is now deprecated and scheduled to be removed from the kernel
	  after fanotify pre-content hooks are landed.

	  If unsure, say N.

config EROFS_FS_PCPU_KTHREAD
+1 −0
Original line number Diff line number Diff line
@@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+74 −35
Original line number Diff line number Diff line
@@ -59,8 +59,12 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,

void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
	if (erofs_is_fscache_mode(sb))
		buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
	struct erofs_sb_info *sbi = EROFS_SB(sb);

	if (erofs_is_fileio_mode(sbi))
		buf->mapping = file_inode(sbi->fdev)->i_mapping;
	else if (erofs_is_fscache_mode(sb))
		buf->mapping = sbi->s_fscache->inode->i_mapping;
	else
		buf->mapping = sb->s_bdev->bd_mapping;
}
@@ -75,38 +79,28 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
static int erofs_map_blocks_flatmode(struct inode *inode,
				     struct erofs_map_blocks *map)
{
	erofs_blk_t nblocks, lastblk;
	u64 offset = map->m_la;
	struct erofs_inode *vi = EROFS_I(inode);
	struct super_block *sb = inode->i_sb;
	bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
	erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;

	nblocks = erofs_iblks(inode);
	lastblk = nblocks - tailendpacking;

	/* there is no hole in flatmode */
	map->m_flags = EROFS_MAP_MAPPED;
	if (offset < erofs_pos(sb, lastblk)) {
	map->m_flags = EROFS_MAP_MAPPED;	/* no hole in flat inodes */
	if (map->m_la < erofs_pos(sb, lastblk)) {
		map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
		map->m_plen = erofs_pos(sb, lastblk) - offset;
	} else if (tailendpacking) {
		map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
	} else {
		DBG_BUGON(!tailendpacking);
		map->m_pa = erofs_iloc(inode) + vi->inode_isize +
			vi->xattr_isize + erofs_blkoff(sb, offset);
		map->m_plen = inode->i_size - offset;
			vi->xattr_isize + erofs_blkoff(sb, map->m_la);
		map->m_plen = inode->i_size - map->m_la;

		/* inline data should be located in the same meta block */
		if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
			erofs_err(sb, "inline data cross block boundary @ nid %llu",
				  vi->nid);
			erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
			DBG_BUGON(1);
			return -EFSCORRUPTED;
		}
		map->m_flags |= EROFS_MAP_META;
	} else {
		erofs_err(sb, "internal error @ nid: %llu (size %llu), m_la 0x%llx",
			  vi->nid, inode->i_size, map->m_la);
		DBG_BUGON(1);
		return -EIO;
	}
	return 0;
}
@@ -128,7 +122,7 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
	if (map->m_la >= inode->i_size) {
		/* leave out-of-bound access unmapped */
		map->m_flags = 0;
		map->m_plen = 0;
		map->m_plen = map->m_llen;
		goto out;
	}

@@ -189,16 +183,34 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
	return err;
}

static void erofs_fill_from_devinfo(struct erofs_map_dev *map,
				    struct erofs_device_info *dif)
{
	map->m_bdev = NULL;
	map->m_fp = NULL;
	if (dif->file) {
		if (S_ISBLK(file_inode(dif->file)->i_mode))
			map->m_bdev = file_bdev(dif->file);
		else
			map->m_fp = dif->file;
	}
	map->m_daxdev = dif->dax_dev;
	map->m_dax_part_off = dif->dax_part_off;
	map->m_fscache = dif->fscache;
}

int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
	struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
	struct erofs_device_info *dif;
	erofs_off_t startoff, length;
	int id;

	map->m_bdev = sb->s_bdev;
	map->m_daxdev = EROFS_SB(sb)->dax_dev;
	map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
	map->m_fscache = EROFS_SB(sb)->s_fscache;
	map->m_fp = EROFS_SB(sb)->fdev;

	if (map->m_deviceid) {
		down_read(&devs->rwsem);
@@ -212,29 +224,20 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
			up_read(&devs->rwsem);
			return 0;
		}
		map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
		map->m_daxdev = dif->dax_dev;
		map->m_dax_part_off = dif->dax_part_off;
		map->m_fscache = dif->fscache;
		erofs_fill_from_devinfo(map, dif);
		up_read(&devs->rwsem);
	} else if (devs->extra_devices && !devs->flatdev) {
		down_read(&devs->rwsem);
		idr_for_each_entry(&devs->tree, dif, id) {
			erofs_off_t startoff, length;

			if (!dif->mapped_blkaddr)
				continue;

			startoff = erofs_pos(sb, dif->mapped_blkaddr);
			length = erofs_pos(sb, dif->blocks);

			if (map->m_pa >= startoff &&
			    map->m_pa < startoff + length) {
				map->m_pa -= startoff;
				map->m_bdev = dif->bdev_file ?
					      file_bdev(dif->bdev_file) : NULL;
				map->m_daxdev = dif->dax_dev;
				map->m_dax_part_off = dif->dax_part_off;
				map->m_fscache = dif->fscache;
				erofs_fill_from_devinfo(map, dif);
				break;
			}
		}
@@ -243,6 +246,42 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
	return 0;
}

/*
 * bit 30: I/O error occurred on this folio
 * bit 0 - 29: remaining parts to complete this folio
 */
#define EROFS_ONLINEFOLIO_EIO			(1 << 30)

void erofs_onlinefolio_init(struct folio *folio)
{
	union {
		atomic_t o;
		void *v;
	} u = { .o = ATOMIC_INIT(1) };

	folio->private = u.v;	/* valid only if file-backed folio is locked */
}

void erofs_onlinefolio_split(struct folio *folio)
{
	atomic_inc((atomic_t *)&folio->private);
}

void erofs_onlinefolio_end(struct folio *folio, int err)
{
	int orig, v;

	do {
		orig = atomic_read((atomic_t *)&folio->private);
		v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
	} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);

	if (v & ~EROFS_ONLINEFOLIO_EIO)
		return;
	folio->private = 0;
	folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
}

static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
@@ -392,7 +431,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}

/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
const struct address_space_operations erofs_aops = {
	.read_folio = erofs_read_folio,
	.readahead = erofs_readahead,
	.bmap = erofs_bmap,
+1 −1
Original line number Diff line number Diff line
@@ -539,7 +539,7 @@ int __init z_erofs_init_decompressor(void)
	for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) {
		err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0;
		if (err) {
			while (--i)
			while (i--)
				if (z_erofs_decomp[i])
					z_erofs_decomp[i]->exit();
			return err;
+4 −1
Original line number Diff line number Diff line
@@ -288,9 +288,12 @@ struct erofs_dirent {

#define EROFS_NAME_LEN      255

/* maximum supported size of a physical compression cluster */
/* maximum supported encoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE	(1024 * 1024)

/* maximum supported decoded size of a physical compressed cluster */
#define Z_EROFS_PCLUSTER_MAX_DSIZE	(12 * 1024 * 1024)

/* available compression algorithm types (for h_algorithmtype) */
enum {
	Z_EROFS_COMPRESSION_LZ4		= 0,
Loading