Unverified Commit 0209e316 authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "fsmount: add FSMOUNT_NAMESPACE"

Christian Brauner <brauner@kernel.org> says:

Add FSMOUNT_NAMESPACE flag to fsmount() that creates a new mount
namespace with the newly created filesystem attached to a copy of the
real rootfs. This returns a namespace file descriptor instead of an
O_PATH mount fd, similar to how OPEN_TREE_NAMESPACE works for
open_tree().

This allows creating a new filesystem and immediately placing it in a
new mount namespace in a single operation, which is useful for container
runtimes and other namespace-based isolation mechanisms.

This accompanies OPEN_TREE_NAMESPACE and avoids a needless detour via
OPEN_TREE_NAMESPACE to get the same effect. Will be especially useful
when you mount an actual filesystem to be used as the container rootfs.

* patches from https://patch.msgid.link/20260122-work-fsmount-namespace-v1-0-5ef0a886e646@kernel.org:
  selftests/open_tree_ns: fix compilation
  selftests: add FSMOUNT_NAMESPACE tests
  selftests/statmount: add statmount_alloc() helper
  tools: update mount.h header
  mount: add FSMOUNT_NAMESPACE
  mount: simplify __do_loopback()
  mount: start iterating from start of rbtree

Link: https://patch.msgid.link/20260122-work-fsmount-namespace-v1-0-5ef0a886e646@kernel.org


Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents 1f318b96 3ac7ea91
Loading
Loading
Loading
Loading
+44 −34
Original line number Diff line number Diff line
@@ -2958,10 +2958,9 @@ static inline bool may_copy_tree(const struct path *path)
}

static struct mount *__do_loopback(const struct path *old_path,
				   unsigned int flags, unsigned int copy_flags)
				   bool recurse, unsigned int copy_flags)
{
	struct mount *old = real_mount(old_path->mnt);
	bool recurse = flags & AT_RECURSIVE;

	if (IS_MNT_UNBINDABLE(old))
		return ERR_PTR(-EINVAL);
@@ -2972,18 +2971,6 @@ static struct mount *__do_loopback(const struct path *old_path,
	if (!recurse && __has_locked_children(old, old_path->dentry))
		return ERR_PTR(-EINVAL);

	/*
	 * When creating a new mount namespace we don't want to copy over
	 * mounts of mount namespaces to avoid the risk of cycles and also to
	 * minimize the default complex interdependencies between mount
	 * namespaces.
	 *
	 * We could ofc just check whether all mount namespace files aren't
	 * creating cycles but really let's keep this simple.
	 */
	if (!(flags & OPEN_TREE_NAMESPACE))
		copy_flags |= CL_COPY_MNT_NS_FILE;

	if (recurse)
		return copy_tree(old, old_path->dentry, copy_flags);

@@ -2998,7 +2985,6 @@ static int do_loopback(const struct path *path, const char *old_name,
{
	struct path old_path __free(path_put) = {};
	struct mount *mnt = NULL;
	unsigned int flags = recurse ? AT_RECURSIVE : 0;
	int err;

	if (!old_name || !*old_name)
@@ -3017,7 +3003,7 @@ static int do_loopback(const struct path *path, const char *old_name,
	if (!check_mnt(mp.parent))
		return -EINVAL;

	mnt = __do_loopback(&old_path, flags, 0);
	mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

@@ -3055,7 +3041,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned
			ns->seq_origin = src_mnt_ns->ns.ns_id;
	}

	mnt = __do_loopback(path, flags, 0);
	mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
	if (IS_ERR(mnt)) {
		emptied_ns = ns;
		return ERR_CAST(mnt);
@@ -3087,7 +3073,8 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
	return file;
}

static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
static struct mnt_namespace *create_new_namespace(struct path *path,
						  bool recurse)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct user_namespace *user_ns = current_user_ns();
@@ -3131,11 +3118,26 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
	}

	/*
	 * We don't emulate unshare()ing a mount namespace. We stick
	 * to the restrictions of creating detached bind-mounts. It
	 * has a lot saner and simpler semantics.
	 * We don't emulate unshare()ing a mount namespace. We stick to
	 * the restrictions of creating detached bind-mounts. It has a
	 * lot saner and simpler semantics.
	 */
	mnt = __do_loopback(path, flags, copy_flags);
	mnt = real_mount(path->mnt);
	if (!mnt->mnt_ns) {
		/*
		 * If we're moving into a new mount namespace via
		 * fsmount() swap the mount ids so the nullfs mount id
		 * is the lowest in the mount namespace avoiding another
		 * useless copy. This is fine we're not attached to any
		 * mount namespace so the mount ids are pure decoration
		 * at that point.
		 */
		swap(mnt->mnt_id_unique, new_ns_root->mnt_id_unique);
		swap(mnt->mnt_id, new_ns_root->mnt_id);
		mntget(&mnt->mnt);
	} else {
		mnt = __do_loopback(path, recurse, copy_flags);
	}
	scoped_guard(mount_writer) {
		if (IS_ERR(mnt)) {
			emptied_ns = new_ns;
@@ -3164,11 +3166,11 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
	return new_ns;
}

static struct file *open_new_namespace(struct path *path, unsigned int flags)
static struct file *open_new_namespace(struct path *path, bool recurse)
{
	struct mnt_namespace *new_ns;

	new_ns = create_new_namespace(path, flags);
	new_ns = create_new_namespace(path, recurse);
	if (IS_ERR(new_ns))
		return ERR_CAST(new_ns);
	return open_namespace_file(to_ns_common(new_ns));
@@ -3217,7 +3219,7 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
		return ERR_PTR(ret);

	if (flags & OPEN_TREE_NAMESPACE)
		return open_new_namespace(&path, flags);
		return open_new_namespace(&path, (flags & AT_RECURSIVE));

	if (flags & OPEN_TREE_CLONE)
		return open_detached_copy(&path, flags);
@@ -4414,11 +4416,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
	unsigned int mnt_flags = 0;
	long ret;

	if (!may_mount())
	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
		return -EINVAL;

	if ((flags & FSMOUNT_NAMESPACE) &&
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
		return -EINVAL;
	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
		return -EPERM;

	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
		return -EINVAL;
@@ -4485,6 +4491,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
	 */
	vfs_clean_context(fc);

	if (flags & FSMOUNT_NAMESPACE)
		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
			      open_new_namespace(&new_path, 0));

	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
	if (IS_ERR(ns))
		return PTR_ERR(ns);
@@ -5649,14 +5659,14 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
	if (mnt_ns_empty(ns))
		return -ENOENT;

	first = child = ns->root;
	for (;;) {
		child = listmnt_next(child, false);
		if (!child)
			return -ENOENT;
		if (child->mnt_parent == first)
	first = ns->root;
	for (child = node_to_mount(ns->mnt_first_node); child;
	     child = listmnt_next(child, false)) {
		if (child != first && child->mnt_parent == first)
			break;
	}
	if (!child)
		return -ENOENT;

	root->mnt = mntget(&child->mnt);
	root->dentry = dget(root->mnt->mnt_root);
+1 −0
Original line number Diff line number Diff line
@@ -110,6 +110,7 @@ enum fsconfig_command {
 * fsmount() flags.
 */
#define FSMOUNT_CLOEXEC		0x00000001
#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */

/*
 * Mount attributes.
+12 −2
Original line number Diff line number Diff line
@@ -61,7 +61,8 @@
/*
 * open_tree() flags.
 */
#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
#define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */

/*
@@ -109,6 +110,7 @@ enum fsconfig_command {
 * fsmount() flags.
 */
#define FSMOUNT_CLOEXEC		0x00000001
#define FSMOUNT_NAMESPACE	0x00000002	/* Create the mount in a new mount namespace */

/*
 * Mount attributes.
@@ -197,7 +199,10 @@ struct statmount {
 */
struct mnt_id_req {
	__u32 size;
	__u32 spare;
	union {
		__u32 mnt_ns_fd;
		__u32 mnt_fd;
	};
	__u64 mnt_id;
	__u64 param;
	__u64 mnt_ns_id;
@@ -232,4 +237,9 @@ struct mnt_id_req {
#define LSMT_ROOT		0xffffffffffffffff	/* root mount */
#define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */

/*
 * @flag bits for statmount(2)
 */
#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */

#endif /* _UAPI_LINUX_MOUNT_H */
+1 −0
Original line number Diff line number Diff line
fsmount_ns_test
+10 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0
TEST_GEN_PROGS := fsmount_ns_test

CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
LDLIBS := -lcap

include ../../lib.mk

$(OUTPUT)/fsmount_ns_test: fsmount_ns_test.c ../utils.c
	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
Loading