Commit 38938540 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull erofs updates from Gao Xiang:
 "In this cycle, inode page cache sharing among filesystems on the same
  machine is now supported, which is particularly useful for
  high-density hosts running tens of thousands of containers.

  In addition, we fully isolate the EROFS core on-disk format from other
  optional encoded layouts since the core on-disk part is designed to be
  simple, effective, and secure. Users can use the core format to build
  unique golden immutable images and import their filesystem trees
  directly from raw block devices via DMA, page-mapped DAX devices,
  and/or file-backed mounts without having to worry about unnecessary
  intrinsic consistency issues found in other generic filesystems by
  design. However, the full vision is still a work in progress and will
  take more time to achieve its final goals.

  There are other improvements and bug fixes as usual, as listed below:

   - Support inode page cache sharing among filesystems

   - Formally separate optional encoded (aka compressed) inode layouts
     (and the implementations) from the EROFS core on-disk aligned plain
     format for future zero-trust security usage

   - Improve performance by caching the fact that an inode does not have
     a POSIX ACL

   - Improve LZ4 decompression error reporting

   - Enable LZMA by default and promote DEFLATE and Zstandard algorithms
     out of EXPERIMENTAL status

   - Switch to inode_set_cached_link() to cache symlink lengths

   - random bugfixes and minor cleanups"

* tag 'erofs-for-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs: (31 commits)
  erofs: fix UAF issue for file-backed mounts w/ directio option
  erofs: update compression algorithm status
  erofs: fix inline data read failure for ztailpacking pclusters
  erofs: avoid some unnecessary #ifdefs
  erofs: handle end of filesystem properly for file-backed mounts
  erofs: separate plain and compressed filesystems formally
  erofs: use inode_set_cached_link()
  erofs: mark inodes without acls in erofs_read_inode()
  erofs: implement .fadvise for page cache share
  erofs: support compressed inodes for page cache share
  erofs: support unencoded inodes for page cache share
  erofs: pass inode to trace_erofs_read_folio
  erofs: introduce the page cache share feature
  erofs: using domain_id in the safer way
  erofs: add erofs_inode_set_aops helper to set the aops
  erofs: support user-defined fingerprint name
  erofs: decouple `struct erofs_anon_fs_type`
  fs: Export alloc_empty_backing_file
  erofs: tidy up erofs_init_inode_xattrs()
  erofs: add missing documentation about `directio` mount option
  ...
parents 4fb7d86f 1caf50ce
Loading
Loading
Loading
Loading
+12 −8
Original line number Diff line number Diff line
@@ -3,19 +3,23 @@ Date: November 2021
Contact:	"Huang Jianan" <huangjianan@oppo.com>
Description:	Shows all enabled kernel features.
		Supported features:
		zero_padding, compr_cfgs, big_pcluster, chunked_file,
		device_table, compr_head2, sb_chksum, ztailpacking,
		dedupe, fragments, 48bit, metabox.
		compr_cfgs, big_pcluster, chunked_file, device_table,
		compr_head2, sb_chksum, ztailpacking, dedupe, fragments,
		48bit, metabox.

What:		/sys/fs/erofs/<disk>/sync_decompress
Date:		November 2021
Contact:	"Huang Jianan" <huangjianan@oppo.com>
Description:	Control strategy of sync decompression:
Description:	Control strategy of synchronous decompression. Synchronous
		decompression tries to decompress in the reader thread for
		synchronous reads and small asynchronous reads (<= 12 KiB):

		- 0 (default, auto): enable for readpage, and enable for
		  readahead on atomic contexts only.
		- 1 (force on): enable for readpage and readahead.
		- 2 (force off): disable for all situations.
		- 0 (auto, default): apply to synchronous reads only, but will
		                     switch to 1 (force on) if any decompression
		                     request is detected in atomic contexts;
		- 1 (force on): apply to synchronous reads and small
		                asynchronous reads;
		- 2 (force off): disable synchronous decompression completely.

What:		/sys/fs/erofs/<disk>/drop_caches
Date:		November 2024
+13 −5
Original line number Diff line number Diff line
@@ -63,9 +63,9 @@ Here are the main features of EROFS:
 - Support POSIX.1e ACLs by using extended attributes;

 - Support transparent data compression as an option:
   LZ4, MicroLZMA and DEFLATE algorithms can be used on a per-file basis; In
   addition, inplace decompression is also supported to avoid bounce compressed
   buffers and unnecessary page cache thrashing.
   LZ4, MicroLZMA, DEFLATE and Zstandard algorithms can be used on a per-file
   basis; in addition, in-place decompression is also supported to avoid bounce
   compressed buffers and unnecessary page cache thrashing.

 - Support chunk-based data deduplication and rolling-hash compressed data
   deduplication;
@@ -125,10 +125,18 @@ dax={always,never} Use direct access (no page cache). See
                       Documentation/filesystems/dax.rst.
dax                    A legacy option which is an alias for ``dax=always``.
device=%s              Specify a path to an extra device to be used together.
directio               (For file-backed mounts) Use direct I/O to access backing
                       files, and asynchronous I/O will be enabled if supported.
fsid=%s                Specify a filesystem image ID for Fscache back-end.
domain_id=%s           Specify a domain ID in fscache mode so that different images
                       with the same blobs under a given domain ID can share storage.
domain_id=%s           Specify a trusted domain ID for fscache mode so that
                       different images with the same blobs, identified by blob IDs,
                       can share storage within the same trusted domain.
                       Also used for different filesystems with inode page sharing
                       enabled to share page cache within the trusted domain.
fsoffset=%llu          Specify block-aligned filesystem offset for the primary device.
inode_share            Enable inode page sharing for this filesystem.  Inodes with
                       identical content within the same domain ID can share the
                       page cache.
===================    =========================================================

Sysfs Entries
+12 −8
Original line number Diff line number Diff line
@@ -112,13 +112,14 @@ config EROFS_FS_ZIP
config EROFS_FS_ZIP_LZMA
	bool "EROFS LZMA compressed data support"
	depends on EROFS_FS_ZIP
	default y
	help
	  Saying Y here includes support for reading EROFS file systems
	  containing LZMA compressed data, specifically called microLZMA. It
	  gives better compression ratios than the default LZ4 format, at the
	  expense of more CPU overhead.

	  If unsure, say N.
	  Say N if you want to disable LZMA compression support.

config EROFS_FS_ZIP_DEFLATE
	bool "EROFS DEFLATE compressed data support"
@@ -129,9 +130,6 @@ config EROFS_FS_ZIP_DEFLATE
	  ratios than the default LZ4 format, while it costs more CPU
	  overhead.

	  DEFLATE support is an experimental feature for now and so most
	  file systems will be readable without selecting this option.

	  If unsure, say N.

config EROFS_FS_ZIP_ZSTD
@@ -141,10 +139,7 @@ config EROFS_FS_ZIP_ZSTD
	  Saying Y here includes support for reading EROFS file systems
	  containing Zstandard compressed data.  It gives better compression
	  ratios than the default LZ4 format, while it costs more CPU
	  overhead.

	  Zstandard support is an experimental feature for now and so most
	  file systems will be readable without selecting this option.
	  overhead and memory footprint.

	  If unsure, say N.

@@ -194,3 +189,12 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI
	  at higher priority.

	  If unsure, say N.

config EROFS_FS_PAGE_CACHE_SHARE
	bool "EROFS page cache share support (experimental)"
	depends on EROFS_FS && EROFS_FS_XATTR && !EROFS_FS_ONDEMAND
	help
	  This enables page cache sharing among inodes with identical
	  content fingerprints on the same machine.

	  If unsure, say N.
+1 −0
Original line number Diff line number Diff line
@@ -10,3 +10,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += ishare.o
+32 −24
Original line number Diff line number Diff line
@@ -270,6 +270,7 @@ void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
/*
 * Private context handed to the EROFS iomap callbacks; erofs_iomap_begin()
 * retrieves it from iomap_iter->private.
 */
struct erofs_iomap_iter_ctx {
	/* NOTE(review): users of @page and @base are not visible here — confirm */
	struct page *page;
	void *base;
	/*
	 * Inode whose block mapping is looked up: erofs_iomap_begin() uses it
	 * instead of its own @inode argument whenever a context is supplied.
	 */
	struct inode *realinode;
};

static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -277,14 +278,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
{
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
	struct erofs_iomap_iter_ctx *ctx = iter->private;
	struct super_block *sb = inode->i_sb;
	struct inode *realinode = ctx ? ctx->realinode : inode;
	struct super_block *sb = realinode->i_sb;
	struct erofs_map_blocks map;
	struct erofs_map_dev mdev;
	int ret;

	map.m_la = offset;
	map.m_llen = length;
	ret = erofs_map_blocks(inode, &map);
	ret = erofs_map_blocks(realinode, &map);
	if (ret < 0)
		return ret;

@@ -297,7 +299,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		return 0;
	}

	if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
	if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(realinode)) {
		mdev = (struct erofs_map_dev) {
			.m_deviceid = map.m_deviceid,
			.m_pa = map.m_pa,
@@ -323,7 +325,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			void *ptr;

			ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
						 erofs_inode_in_metabox(inode));
						 erofs_inode_in_metabox(realinode));
			if (IS_ERR(ptr))
				return PTR_ERR(ptr);
			iomap->inline_data = ptr;
@@ -364,12 +366,10 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
		if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP))
			return -EOPNOTSUPP;
		return iomap_fiemap(inode, fieinfo, start, len,
				    &z_erofs_iomap_report_ops);
#else
		return -EOPNOTSUPP;
#endif
	}
	return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
}
@@ -384,11 +384,15 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
		.ops		= &iomap_bio_read_ops,
		.cur_folio	= folio,
	};
	struct erofs_iomap_iter_ctx iter_ctx = {};

	trace_erofs_read_folio(folio, true);
	bool need_iput;
	struct erofs_iomap_iter_ctx iter_ctx = {
		.realinode = erofs_real_inode(folio_inode(folio), &need_iput),
	};

	trace_erofs_read_folio(iter_ctx.realinode, folio, true);
	iomap_read_folio(&erofs_iomap_ops, &read_ctx, &iter_ctx);
	if (need_iput)
		iput(iter_ctx.realinode);
	return 0;
}

@@ -398,12 +402,16 @@ static void erofs_readahead(struct readahead_control *rac)
		.ops		= &iomap_bio_read_ops,
		.rac		= rac,
	};
	struct erofs_iomap_iter_ctx iter_ctx = {};
	bool need_iput;
	struct erofs_iomap_iter_ctx iter_ctx = {
		.realinode = erofs_real_inode(rac->mapping->host, &need_iput),
	};

	trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
	trace_erofs_readahead(iter_ctx.realinode, readahead_index(rac),
			      readahead_count(rac), true);

	iomap_readahead(&erofs_iomap_ops, &read_ctx, &iter_ctx);
	if (need_iput)
		iput(iter_ctx.realinode);
}

static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
@@ -419,12 +427,13 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
	if (!iov_iter_count(to))
		return 0;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
	if (IS_ENABLED(CONFIG_FS_DAX) && IS_DAX(inode))
		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif

	if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) {
		struct erofs_iomap_iter_ctx iter_ctx = {};
		struct erofs_iomap_iter_ctx iter_ctx = {
			.realinode = inode,
		};

		return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
				    NULL, 0, &iter_ctx, 0);
@@ -480,12 +489,11 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
	struct inode *inode = file->f_mapping->host;
	const struct iomap_ops *ops = &erofs_iomap_ops;

	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
#ifdef CONFIG_EROFS_FS_ZIP
		ops = &z_erofs_iomap_report_ops;
#else
	if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
		if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP))
			return generic_file_llseek(file, offset, whence);
#endif
		ops = &z_erofs_iomap_report_ops;
	}

	if (whence == SEEK_HOLE)
		offset = iomap_seek_hole(inode, offset, ops);
Loading