Commit 9e355113 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull misc vfs updates from Christian Brauner:
 "This contains a mix of VFS cleanups, performance improvements, API
  fixes, documentation, and a deprecation notice.

  Scalability and performance:

   - Rework pid allocation to only take pidmap_lock once instead of
     twice during alloc_pid(), improving thread creation/teardown
     throughput by 10-16% depending on false-sharing luck. Pad the
     namespace refcount to reduce false-sharing

   - Track file lock presence via a flag in ->i_opflags instead of
     reading ->i_flctx, avoiding false-sharing with ->i_readcount on
     open/close hot paths. Measured 4-16% improvement on 24-core
     open-in-a-loop benchmarks

   - Use a consume fence in locks_inode_context() to match the
     store-release/load-consume idiom, eliminating a hardware fence on
     some architectures

   - Annotate cdev_lock with __cacheline_aligned_in_smp to prevent
     false-sharing

   - Remove a redundant DCACHE_MANAGED_DENTRY check in
     __follow_mount_rcu() that never fires since the caller already
     verifies it, eliminating a 100% mispredicted branch

   - Fix a 100% mispredicted likely() in devcgroup_inode_permission()
     that became wrong after a prior code reorder

  Bug fixes and correctness:

   - Make insert_inode_locked() wait for inode destruction instead of
     skipping, fixing a corner case where two matching inodes could
     exist in the hash

   - Move f_mode initialization before file_ref_init() in alloc_file()
     to respect the SLAB_TYPESAFE_BY_RCU ordering contract

   - Add a WARN_ON_ONCE guard in try_to_free_buffers() for folios with
     no buffers attached, preventing a null pointer dereference when
     AS_RELEASE_ALWAYS is set but no release_folio op exists

   - Fix select restart_block to store end_time as timespec64, avoiding
     truncation of tv_sec on 32-bit architectures

   - Make dump_inode() use get_kernel_nofault() to safely access inode
     and superblock fields, matching the dump_mapping() pattern

  API modernization:

   - Make posix_acl_to_xattr() allocate the buffer internally since
     every single caller was doing it anyway. Reduces boilerplate and
     unnecessary error checking across ~15 filesystems

   - Replace deprecated simple_strtoul() with kstrtoul() for the
     ihash_entries, dhash_entries, mhash_entries, and mphash_entries
     boot parameters, adding proper error handling

   - Convert chardev code to use guard(mutex) and __free(kfree) cleanup
     patterns

   - Replace min_t() with min() or umin() in VFS code to avoid silently
     truncating unsigned long to unsigned int

   - Gate LOOKUP_RCU assertions behind CONFIG_DEBUG_VFS since callers
     already check the flag

  Deprecation:

   - Begin deprecating legacy BSD process accounting (acct(2)). The
     interface has numerous footguns and better alternatives exist
     (eBPF)

  Documentation:

   - Fix and complete kernel-doc for struct export_operations, removing
     duplicated documentation between ReST and source

   - Fix kernel-doc warnings for __start_dirop() and ilookup5_nowait()

  Testing:

   - Add a kunit test for initramfs cpio handling of entries with
     filesize > PATH_MAX

  Misc:

   - Add missing <linux/init_task.h> include in fs_struct.c"

* tag 'vfs-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (28 commits)
  posix_acl: make posix_acl_to_xattr() alloc the buffer
  fs: make insert_inode_locked() wait for inode destruction
  initramfs_test: kunit test for cpio.filesize > PATH_MAX
  fs: improve dump_inode() to safely access inode fields
  fs: add <linux/init_task.h> for 'init_fs'
  docs: exportfs: Use source code struct documentation
  fs: move initializing f_mode before file_ref_init()
  exportfs: Complete kernel-doc for struct export_operations
  exportfs: Mark struct export_operations functions at kernel-doc
  exportfs: Fix kernel-doc output for get_name()
  acct(2): begin the deprecation of legacy BSD process accounting
  device_cgroup: remove branch hint after code refactor
  VFS: fix __start_dirop() kernel-doc warnings
  fs: Describe @isnew parameter in ilookup5_nowait()
  fs/namei: Remove redundant DCACHE_MANAGED_DENTRY check in __follow_mount_rcu
  fs: only assert on LOOKUP_RCU when built with CONFIG_DEBUG_VFS
  select: store end_time as timespec64 in restart block
  chardev: Switch to guard(mutex) and __free(kfree)
  namespace: Replace simple_strtoul with kstrtoul to parse boot params
  dcache: Replace simple_strtoul with kstrtoul in set_dhash_entries
  ...
parents 3304b3fe 6cbfdf89
Loading
Loading
Loading
Loading
+5 −37
Original line number Diff line number Diff line
@@ -119,43 +119,11 @@ For a filesystem to be exportable it must:

A file system implementation declares that instances of the filesystem
are exportable by setting the s_export_op field in the struct
super_block.  This field must point to a "struct export_operations"
struct which has the following members:

  encode_fh (mandatory)
    Takes a dentry and creates a filehandle fragment which may later be used
    to find or create a dentry for the same object.

  fh_to_dentry (mandatory)
    Given a filehandle fragment, this should find the implied object and
    create a dentry for it (possibly with d_obtain_alias).

  fh_to_parent (optional but strongly recommended)
    Given a filehandle fragment, this should find the parent of the
    implied object and create a dentry for it (possibly with
    d_obtain_alias).  May fail if the filehandle fragment is too small.

  get_parent (optional but strongly recommended)
    When given a dentry for a directory, this should return  a dentry for
    the parent.  Quite possibly the parent dentry will have been allocated
    by d_alloc_anon.  The default get_parent function just returns an error
    so any filehandle lookup that requires finding a parent will fail.
    ->lookup("..") is *not* used as a default as it can leave ".." entries
    in the dcache which are too messy to work with.

  get_name (optional)
    When given a parent dentry and a child dentry, this should find a name
    in the directory identified by the parent dentry, which leads to the
    object identified by the child dentry.  If no get_name function is
    supplied, a default implementation is provided which uses vfs_readdir
    to find potential names, and matches inode numbers to find the correct
    match.

  flags
    Some filesystems may need to be handled differently than others. The
    export_operations struct also includes a flags field that allows the
    filesystem to communicate such information to nfsd. See the Export
    Operations Flags section below for more explanation.
super_block.  This field must point to a struct export_operations
which has the following members:

.. kernel-doc:: include/linux/exportfs.h
   :identifiers: struct export_operations

A filehandle fragment consists of an array of 1 or more 4byte words,
together with a one byte "type".
+3 −13
Original line number Diff line number Diff line
@@ -167,17 +167,11 @@ int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		if (retval)
			goto err_out;

		size = posix_acl_xattr_size(acl->a_count);

		value = kzalloc(size, GFP_NOFS);
		value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS);
		if (!value) {
			retval = -ENOMEM;
			goto err_out;
		}

		retval = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		if (retval < 0)
			goto err_out;
	}

	/*
@@ -257,13 +251,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
		return 0;

	/* Set a setxattr request to server */
	size = posix_acl_xattr_size(acl->a_count);
	buffer = kmalloc(size, GFP_KERNEL);
	buffer = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
	if (!buffer)
		return -ENOMEM;
	retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
	if (retval < 0)
		goto err_free_out;

	switch (type) {
	case ACL_TYPE_ACCESS:
		name = XATTR_NAME_POSIX_ACL_ACCESS;
@@ -275,7 +266,6 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
		BUG();
	}
	retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0);
err_free_out:
	kfree(buffer);
	return retval;
}
+3 −7
Original line number Diff line number Diff line
@@ -57,7 +57,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
		    struct posix_acl *acl, int type)
{
	int ret, size = 0;
	int ret;
	size_t size = 0;
	const char *name;
	char AUTO_KFREE(value);

@@ -77,20 +78,15 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
	if (acl) {
		unsigned int nofs_flag;

		size = posix_acl_xattr_size(acl->a_count);
		/*
		 * We're holding a transaction handle, so use a NOFS memory
		 * allocation context to avoid deadlock if reclaim happens.
		 */
		nofs_flag = memalloc_nofs_save();
		value = kmalloc(size, GFP_KERNEL);
		value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL);
		memalloc_nofs_restore(nofs_flag);
		if (!value)
			return -ENOMEM;

		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		if (ret < 0)
			return ret;
	}

	if (trans)
+5 −1
Original line number Diff line number Diff line
@@ -2354,7 +2354,7 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
	if (!head)
		return false;
	blocksize = head->b_size;
	to = min_t(unsigned, folio_size(folio) - from, count);
	to = min(folio_size(folio) - from, count);
	to = from + to;
	if (from < blocksize && to > folio_size(folio) - blocksize)
		return false;
@@ -2948,6 +2948,10 @@ bool try_to_free_buffers(struct folio *folio)
	if (folio_test_writeback(folio))
		return false;

	/* Misconfigured folio check */
	if (WARN_ON_ONCE(!folio_buffers(folio)))
		return true;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(folio, &buffers_to_free);
		goto out;
+22 −28
Original line number Diff line number Diff line
@@ -90,7 +90,8 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu)
int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		 struct posix_acl *acl, int type)
{
	int ret = 0, size = 0;
	int ret = 0;
	size_t size = 0;
	const char *name = NULL;
	char *value = NULL;
	struct iattr newattrs;
@@ -126,16 +127,11 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
	}

	if (acl) {
		size = posix_acl_xattr_size(acl->a_count);
		value = kmalloc(size, GFP_NOFS);
		value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS);
		if (!value) {
			ret = -ENOMEM;
			goto out;
		}

		ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		if (ret < 0)
			goto out_free;
	}

	if (new_mode != old_mode) {
@@ -172,7 +168,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
	struct posix_acl *acl, *default_acl;
	size_t val_size1 = 0, val_size2 = 0;
	struct ceph_pagelist *pagelist = NULL;
	void *tmp_buf = NULL;
	void *tmp_buf1 = NULL, *tmp_buf2 = NULL;
	int err;

	err = posix_acl_create(dir, mode, &default_acl, &acl);
@@ -192,15 +188,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
	if (!default_acl && !acl)
		return 0;

	if (acl)
		val_size1 = posix_acl_xattr_size(acl->a_count);
	if (default_acl)
		val_size2 = posix_acl_xattr_size(default_acl->a_count);

	err = -ENOMEM;
	tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
	if (!tmp_buf)
		goto out_err;
	pagelist = ceph_pagelist_alloc(GFP_KERNEL);
	if (!pagelist)
		goto out_err;
@@ -213,34 +201,39 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,

	if (acl) {
		size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);

		err = -ENOMEM;
		tmp_buf1 = posix_acl_to_xattr(&init_user_ns, acl,
					      &val_size1, GFP_KERNEL);
		if (!tmp_buf1)
			goto out_err;
		err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
		if (err)
			goto out_err;
		ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
					    len);
		err = posix_acl_to_xattr(&init_user_ns, acl,
					 tmp_buf, val_size1);
		if (err < 0)
			goto out_err;
		ceph_pagelist_encode_32(pagelist, val_size1);
		ceph_pagelist_append(pagelist, tmp_buf, val_size1);
		ceph_pagelist_append(pagelist, tmp_buf1, val_size1);
	}
	if (default_acl) {
		size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);

		err = -ENOMEM;
		tmp_buf2 = posix_acl_to_xattr(&init_user_ns, default_acl,
					      &val_size2, GFP_KERNEL);
		if (!tmp_buf2)
			goto out_err;
		err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
		if (err)
			goto out_err;
		ceph_pagelist_encode_string(pagelist,
					  XATTR_NAME_POSIX_ACL_DEFAULT, len);
		err = posix_acl_to_xattr(&init_user_ns, default_acl,
					 tmp_buf, val_size2);
		if (err < 0)
			goto out_err;
		ceph_pagelist_encode_32(pagelist, val_size2);
		ceph_pagelist_append(pagelist, tmp_buf, val_size2);
		ceph_pagelist_append(pagelist, tmp_buf2, val_size2);
	}

	kfree(tmp_buf);
	kfree(tmp_buf1);
	kfree(tmp_buf2);

	as_ctx->acl = acl;
	as_ctx->default_acl = default_acl;
@@ -250,7 +243,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
out_err:
	posix_acl_release(acl);
	posix_acl_release(default_acl);
	kfree(tmp_buf);
	kfree(tmp_buf1);
	kfree(tmp_buf2);
	if (pagelist)
		ceph_pagelist_release(pagelist);
	return err;
Loading