Commit 157d3d6e authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'vfs-7.0-rc1.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs mount updates from Christian Brauner:

 - statmount: accept fd as a parameter

   Extend struct mnt_id_req with a file descriptor field and a new
   STATMOUNT_BY_FD flag. When set, statmount() returns mount information
   for the mount the fd resides on — including detached mounts
   (unmounted via umount2(MNT_DETACH)).

   For detached mounts the STATMOUNT_MNT_POINT and STATMOUNT_MNT_NS_ID
   mask bits are cleared since neither is meaningful. The capability
   check is skipped for STATMOUNT_BY_FD since holding an fd already
   implies prior access to the mount and equivalent information is
   available through fstatfs() and /proc/pid/mountinfo without
   privilege. Includes comprehensive selftests covering both attached
   and detached mount cases.

 - fs: Remove internal old mount API code (1 patch)

   Now that every in-tree filesystem has been converted to the new
   mount API, remove all the legacy shim code in fs_context.c that
   handled unconverted filesystems. This deletes ~280 lines including
   legacy_init_fs_context(), the legacy_fs_context struct, and
   associated wrappers. The mount(2) syscall path for userspace remains
   untouched. Documentation references to the legacy callbacks are
   cleaned up.

 - mount: add OPEN_TREE_NAMESPACE to open_tree()

   Container runtimes currently use CLONE_NEWNS to copy the caller's
   entire mount namespace — only to then pivot_root() and recursively
   unmount everything they just copied. With large mount tables and
   thousands of parallel container launches this creates significant
   contention on the namespace semaphore.

   OPEN_TREE_NAMESPACE copies only the specified mount tree (like
   OPEN_TREE_CLONE) but returns a mount namespace fd instead of a
   detached mount fd. The new namespace contains the copied tree mounted
   on top of a clone of the real rootfs.

   This functions as a combined unshare(CLONE_NEWNS) + pivot_root() in a
   single syscall. Works with user namespaces: an unshare(CLONE_NEWUSER)
   followed by OPEN_TREE_NAMESPACE creates a mount namespace owned by
   the new user namespace. Mount namespace file mounts are excluded from
   the copy to prevent cycles. Includes ~1000 lines of selftests"

* tag 'vfs-7.0-rc1.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests/open_tree: add OPEN_TREE_NAMESPACE tests
  mount: add OPEN_TREE_NAMESPACE
  fs: Remove internal old mount API code
  selftests: statmount: tests for STATMOUNT_BY_FD
  statmount: accept fd as a parameter
  statmount: permission check should return EPERM
parents 8113b399 1bce1a66
Loading
Loading
Loading
Loading
+0 −8
Original line number Diff line number Diff line
@@ -180,7 +180,6 @@ prototypes::
	int (*freeze_fs) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*umount_begin) (struct super_block *);
	int (*show_options)(struct seq_file *, struct dentry *);
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
@@ -204,7 +203,6 @@ sync_fs: read
freeze_fs:		write
unfreeze_fs:		write
statfs:			maybe(read)	(see below)
remount_fs:		write
umount_begin:		no
show_options:		no		(namespace_sem)
quota_read:		no		(see below)
@@ -229,8 +227,6 @@ file_system_type

prototypes::

	struct dentry *(*mount) (struct file_system_type *, int,
		       const char *, void *);
	void (*kill_sb) (struct super_block *);

locking rules:
@@ -238,13 +234,9 @@ locking rules:
=======		=========
ops		may block
=======		=========
mount		yes
kill_sb		yes
=======		=========

->mount() returns ERR_PTR or the root dentry; its superblock should be locked
on return.

->kill_sb() takes a write-locked superblock, does all shutdown work on it,
unlocks and drops the reference.

+0 −2
Original line number Diff line number Diff line
@@ -299,8 +299,6 @@ manage the filesystem context. They are as follows:
     On success it should return 0.  In the case of an error, it should return
     a negative error code.

     .. Note:: reconfigure is intended as a replacement for remount_fs.


Filesystem context Security
===========================
+2 −5
Original line number Diff line number Diff line
@@ -448,11 +448,8 @@ a file off.

**mandatory**

->get_sb() is gone.  Switch to use of ->mount().  Typically it's just
a matter of switching from calling ``get_sb_``... to ``mount_``... and changing
the function type.  If you were doing it manually, just switch from setting
->mnt_root to some pointer to returning that pointer.  On errors return
ERR_PTR(...).
->get_sb() and ->mount() are gone. Switch to using the new mount API. See
Documentation/filesystems/mount_api.rst for more details.

---

+3 −55
Original line number Diff line number Diff line
@@ -94,11 +94,9 @@ functions:

The passed struct file_system_type describes your filesystem.  When a
request is made to mount a filesystem onto a directory in your
namespace, the VFS will call the appropriate mount() method for the
specific filesystem.  New vfsmount referring to the tree returned by
->mount() will be attached to the mountpoint, so that when pathname
resolution reaches the mountpoint it will jump into the root of that
vfsmount.
namespace, the VFS will call the appropriate get_tree() method for the
specific filesystem.  See Documentation/filesystems/mount_api.rst
for more details.

You can see all filesystems that are registered to the kernel in the
file /proc/filesystems.
@@ -117,8 +115,6 @@ members are defined:
		int fs_flags;
		int (*init_fs_context)(struct fs_context *);
		const struct fs_parameter_spec *parameters;
		struct dentry *(*mount) (struct file_system_type *, int,
			const char *, void *);
		void (*kill_sb) (struct super_block *);
		struct module *owner;
		struct file_system_type * next;
@@ -151,10 +147,6 @@ members are defined:
	'struct fs_parameter_spec'.
	More info in Documentation/filesystems/mount_api.rst.

``mount``
	the method to call when a new instance of this filesystem should
	be mounted

``kill_sb``
	the method to call when an instance of this filesystem should be
	shut down
@@ -173,45 +165,6 @@ members are defined:
  s_lock_key, s_umount_key, s_vfs_rename_key, s_writers_key,
  i_lock_key, i_mutex_key, invalidate_lock_key, i_mutex_dir_key: lockdep-specific

The mount() method has the following arguments:

``struct file_system_type *fs_type``
	describes the filesystem, partly initialized by the specific
	filesystem code

``int flags``
	mount flags

``const char *dev_name``
	the device name we are mounting.

``void *data``
	arbitrary mount options, usually comes as an ASCII string (see
	"Mount Options" section)

The mount() method must return the root dentry of the tree requested by
caller.  An active reference to its superblock must be grabbed and the
superblock must be locked.  On failure it should return ERR_PTR(error).

The arguments match those of mount(2) and their interpretation depends
on filesystem type.  E.g. for block filesystems, dev_name is interpreted
as block device name, that device is opened and if it contains a
suitable filesystem image the method creates and initializes struct
super_block accordingly, returning its root dentry to caller.

->mount() may choose to return a subtree of existing filesystem - it
doesn't have to create a new one.  The main result from the caller's
point of view is a reference to dentry at the root of (sub)tree to be
attached; creation of new superblock is a common side effect.

The most interesting member of the superblock structure that the mount()
method fills in is the "s_op" field.  This is a pointer to a "struct
super_operations" which describes the next level of the filesystem
implementation.

For more information on mounting (and the new mount API), see
Documentation/filesystems/mount_api.rst.

The Superblock Object
=====================

@@ -244,7 +197,6 @@ filesystem. The following members are defined:
					enum freeze_wholder who);
		int (*unfreeze_fs) (struct super_block *);
		int (*statfs) (struct dentry *, struct kstatfs *);
		int (*remount_fs) (struct super_block *, int *, char *);
		void (*umount_begin) (struct super_block *);

		int (*show_options)(struct seq_file *, struct dentry *);
@@ -351,10 +303,6 @@ or bottom half).
``statfs``
	called when the VFS needs to get filesystem statistics.

``remount_fs``
	called when the filesystem is remounted.  This is called with
	the kernel lock held

``umount_begin``
	called when the VFS is unmounting a filesystem.

+3 −205
Original line number Diff line number Diff line
@@ -24,20 +24,6 @@
#include "mount.h"
#include "internal.h"

enum legacy_fs_param {
	LEGACY_FS_UNSET_PARAMS,
	LEGACY_FS_MONOLITHIC_PARAMS,
	LEGACY_FS_INDIVIDUAL_PARAMS,
};

struct legacy_fs_context {
	char			*legacy_data;	/* Data page for legacy filesystems */
	size_t			data_size;
	enum legacy_fs_param	param_type;
};

static int legacy_init_fs_context(struct fs_context *fc);

static const struct constant_table common_set_sb_flag[] = {
	{ "dirsync",	SB_DIRSYNC },
	{ "lazytime",	SB_LAZYTIME },
@@ -275,7 +261,6 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
				      unsigned int sb_flags_mask,
				      enum fs_context_purpose purpose)
{
	int (*init_fs_context)(struct fs_context *);
	struct fs_context *fc;
	int ret = -ENOMEM;

@@ -307,12 +292,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
		break;
	}

	/* TODO: Make all filesystems support this unconditionally */
	init_fs_context = fc->fs_type->init_fs_context;
	if (!init_fs_context)
		init_fs_context = legacy_init_fs_context;

	ret = init_fs_context(fc);
	ret = fc->fs_type->init_fs_context(fc);
	if (ret < 0)
		goto err_fc;
	fc->need_free = true;
@@ -376,8 +356,6 @@ void fc_drop_locked(struct fs_context *fc)
	deactivate_locked_super(sb);
}

static void legacy_fs_context_free(struct fs_context *fc);

/**
 * vfs_dup_fs_context - Duplicate a filesystem context.
 * @src_fc: The context to copy.
@@ -531,184 +509,6 @@ void put_fs_context(struct fs_context *fc)
}
EXPORT_SYMBOL(put_fs_context);

/*
 * Free the config for a filesystem that doesn't support fs_context.
 */
static void legacy_fs_context_free(struct fs_context *fc)
{
	struct legacy_fs_context *ctx = fc->fs_private;

	if (ctx) {
		if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
			kfree(ctx->legacy_data);
		kfree(ctx);
	}
}

/*
 * Duplicate a legacy config.
 */
static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
	struct legacy_fs_context *ctx;
	struct legacy_fs_context *src_ctx = src_fc->fs_private;

	ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
		ctx->legacy_data = kmemdup(src_ctx->legacy_data,
					   src_ctx->data_size, GFP_KERNEL);
		if (!ctx->legacy_data) {
			kfree(ctx);
			return -ENOMEM;
		}
	}

	fc->fs_private = ctx;
	return 0;
}

/*
 * Add a parameter to a legacy config.  We build up a comma-separated list of
 * options.
 */
static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct legacy_fs_context *ctx = fc->fs_private;
	unsigned int size = ctx->data_size;
	size_t len = 0;
	int ret;

	ret = vfs_parse_fs_param_source(fc, param);
	if (ret != -ENOPARAM)
		return ret;

	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");

	switch (param->type) {
	case fs_value_is_string:
		len = 1 + param->size;
		fallthrough;
	case fs_value_is_flag:
		len += strlen(param->key);
		break;
	default:
		return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
			      param->key);
	}

	if (size + len + 2 > PAGE_SIZE)
		return invalf(fc, "VFS: Legacy: Cumulative options too large");
	if (strchr(param->key, ',') ||
	    (param->type == fs_value_is_string &&
	     memchr(param->string, ',', param->size)))
		return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
			      param->key);
	if (!ctx->legacy_data) {
		ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!ctx->legacy_data)
			return -ENOMEM;
	}

	if (size)
		ctx->legacy_data[size++] = ',';
	len = strlen(param->key);
	memcpy(ctx->legacy_data + size, param->key, len);
	size += len;
	if (param->type == fs_value_is_string) {
		ctx->legacy_data[size++] = '=';
		memcpy(ctx->legacy_data + size, param->string, param->size);
		size += param->size;
	}
	ctx->legacy_data[size] = '\0';
	ctx->data_size = size;
	ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
	return 0;
}

/*
 * Add monolithic mount data.
 */
static int legacy_parse_monolithic(struct fs_context *fc, void *data)
{
	struct legacy_fs_context *ctx = fc->fs_private;

	if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
		pr_warn("VFS: Can't mix monolithic and individual options\n");
		return -EINVAL;
	}

	ctx->legacy_data = data;
	ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
	if (!ctx->legacy_data)
		return 0;

	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
		return 0;
	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
}

/*
 * Get a mountable root with the legacy mount command.
 */
static int legacy_get_tree(struct fs_context *fc)
{
	struct legacy_fs_context *ctx = fc->fs_private;
	struct super_block *sb;
	struct dentry *root;

	root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
				      fc->source, ctx->legacy_data);
	if (IS_ERR(root))
		return PTR_ERR(root);

	sb = root->d_sb;
	BUG_ON(!sb);

	fc->root = root;
	return 0;
}

/*
 * Handle remount.
 */
static int legacy_reconfigure(struct fs_context *fc)
{
	struct legacy_fs_context *ctx = fc->fs_private;
	struct super_block *sb = fc->root->d_sb;

	if (!sb->s_op->remount_fs)
		return 0;

	return sb->s_op->remount_fs(sb, &fc->sb_flags,
				    ctx ? ctx->legacy_data : NULL);
}

const struct fs_context_operations legacy_fs_context_ops = {
	.free			= legacy_fs_context_free,
	.dup			= legacy_fs_context_dup,
	.parse_param		= legacy_parse_param,
	.parse_monolithic	= legacy_parse_monolithic,
	.get_tree		= legacy_get_tree,
	.reconfigure		= legacy_reconfigure,
};

/*
 * Initialise a legacy context for a filesystem that doesn't support
 * fs_context.
 */
static int legacy_init_fs_context(struct fs_context *fc)
{
	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
	if (!fc->fs_private)
		return -ENOMEM;
	fc->ops = &legacy_fs_context_ops;
	return 0;
}

int parse_monolithic_mount_data(struct fs_context *fc, void *data)
{
	int (*monolithic_mount_data)(struct fs_context *, void *);
@@ -757,10 +557,8 @@ int finish_clean_context(struct fs_context *fc)
	if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
		return 0;

	if (fc->fs_type->init_fs_context)
	error = fc->fs_type->init_fs_context(fc);
	else
		error = legacy_init_fs_context(fc);

	if (unlikely(error)) {
		fc->phase = FS_CONTEXT_FAILED;
		return error;
Loading