Unverified Commit 4e9f7592 authored by Christian Brauner
Browse files

Merge patch series "namespace: allow creating empty mount namespaces"

Christian Brauner <brauner@kernel.org> says:

Currently, creating a new mount namespace always copies the entire mount
tree from the caller's namespace.  For containers and sandboxes that
intend to build their mount table from scratch this is wasteful: they
inherit a potentially large mount tree only to immediately tear it down.

This series adds support for creating a mount namespace that contains
only a clone of the root mount, with none of the child mounts.  Two new
flags are introduced:

- CLONE_EMPTY_MNTNS (1ULL << 37, i.e. 0x2000000000) for clone3(), using
  the 64-bit flag space.
- UNSHARE_EMPTY_MNTNS (0x00100000) for unshare(), reusing the
  CLONE_PARENT_SETTID bit which has no meaning for unshare.

Both flags imply CLONE_NEWNS.  The resulting namespace contains a single
nullfs root mount with an immutable empty directory.  The intended
workflow is to then mount a real filesystem (e.g., tmpfs) over the root
and build the mount table from there.

* patches from https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-0-6eb30529bbb0@kernel.org:
  selftests/filesystems: add clone3 tests for empty mount namespaces
  selftests/filesystems: add tests for empty mount namespaces
  namespace: allow creating empty mount namespaces

Link: https://patch.msgid.link/20260306-work-empty-mntns-consolidated-v1-0-6eb30529bbb0@kernel.org


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 0209e316 5b8ffd63
Loading
Loading
Loading
Loading
+56 −29
Original line number Diff line number Diff line
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
		struct user_namespace *user_ns, struct fs_struct *new_fs)
{
	struct mnt_namespace *new_ns;
	struct vfsmount *rootmnt __free(mntput) = NULL;
	struct vfsmount *pwdmnt __free(mntput) = NULL;
	struct path old_root __free(path_put) = {};
	struct path old_pwd __free(path_put) = {};
	struct mount *p, *q;
	struct mount *old;
	struct mount *new;
@@ -4254,10 +4254,17 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
		return new_ns;

	guard(namespace_excl)();
	/* First pass: copy the tree topology */

	if (flags & CLONE_EMPTY_MNTNS)
		copy_flags = 0;
	else
		copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SLAVE;

	if (flags & CLONE_EMPTY_MNTNS)
		new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
	else
		new = copy_tree(old, old->mnt.mnt_root, copy_flags);
	if (IS_ERR(new)) {
		emptied_ns = new_ns;
@@ -4269,10 +4276,29 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
	}
	new_ns->root = new;

	if (flags & CLONE_EMPTY_MNTNS) {
		/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace.  We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
		 * Empty mount namespace: only the root mount exists.
		 * Reset root and pwd to the cloned mount's root dentry.
		 */
		if (new_fs) {
			old_root = new_fs->root;
			old_pwd = new_fs->pwd;

			new_fs->root.mnt = mntget(&new->mnt);
			new_fs->root.dentry = dget(new->mnt.mnt_root);

			new_fs->pwd.mnt = mntget(&new->mnt);
			new_fs->pwd.dentry = dget(new->mnt.mnt_root);
		}
		mnt_add_to_ns(new_ns, new);
		new_ns->nr_mounts++;
	} else {
		/*
		 * Full copy: walk old and new trees in parallel, switching
		 * the tsk->fs->* elements and marking new vfsmounts as
		 * belonging to new namespace.  We have already acquired a
		 * private fs_struct, so tsk->fs->lock is not needed.
		 */
		p = old;
		q = new;
@@ -4281,12 +4307,12 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
			new_ns->nr_mounts++;
			if (new_fs) {
				if (&p->mnt == new_fs->root.mnt) {
					old_root.mnt = new_fs->root.mnt;
					new_fs->root.mnt = mntget(&q->mnt);
				rootmnt = &p->mnt;
				}
				if (&p->mnt == new_fs->pwd.mnt) {
					old_pwd.mnt = new_fs->pwd.mnt;
					new_fs->pwd.mnt = mntget(&q->mnt);
				pwdmnt = &p->mnt;
				}
			}
			p = next_mnt(p, old);
@@ -4297,6 +4323,7 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
			while (p->mnt.mnt_root != q->mnt.mnt_root)
				p = next_mnt(skip_mnt_tree(p), old);
		}
	}
	ns_tree_add_raw(new_ns);
	return new_ns;
}
+7 −0
Original line number Diff line number Diff line
@@ -36,6 +36,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#define CLONE_EMPTY_MNTNS	(1ULL << 37) /* Create an empty mount namespace. */

/*
 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -43,6 +44,12 @@
 */
#define CLONE_NEWTIME	0x00000080	/* New time namespace */

/*
 * unshare flags share the bit space with clone flags but only apply to the
 * unshare syscall:
 */
#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */

#ifndef __ASSEMBLY__
/**
 * struct clone_args - arguments for the clone3 syscall
+15 −2
Original line number Diff line number Diff line
@@ -2619,6 +2619,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
	int trace = 0;
	pid_t nr;

	/*
	 * Creating an empty mount namespace implies creating a new mount
	 * namespace.  Set this before copy_process() so that the
	 * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
	 */
	if (clone_flags & CLONE_EMPTY_MNTNS) {
		clone_flags |= CLONE_NEWNS;
		args->flags = clone_flags;
	}

	/*
	 * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
	 * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
	/* Verify that no unknown flags are passed along. */
	if (kargs->flags &
	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
	      CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
		return false;

	/*
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
				CLONE_NEWTIME))
				CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
		return -EINVAL;
	/*
	 * Not implemented, but pretend it works if there is nothing
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
	/*
	 * If unsharing namespace, must also unshare filesystem information.
	 */
	if (unshare_flags & UNSHARE_EMPTY_MNTNS)
		unshare_flags |= CLONE_NEWNS;
	if (unshare_flags & CLONE_NEWNS)
		unshare_flags |= CLONE_FS;

+16 −5
Original line number Diff line number Diff line
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
	if (!new_nsp)
		return ERR_PTR(-ENOMEM);

	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
	new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
				      user_ns, new_fs);
	if (IS_ERR(new_nsp->mnt_ns)) {
		err = PTR_ERR(new_nsp->mnt_ns);
		goto out_ns;
@@ -212,9 +213,10 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
	struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
	struct user_namespace *user_ns;
	u64 flags = unshare_flags;
	int err = 0;

	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
	if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
		       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
		       CLONE_NEWTIME)))
		return 0;
@@ -223,7 +225,16 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
	/*
	 * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
	 * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
	 */
	if (flags & UNSHARE_EMPTY_MNTNS) {
		flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
		flags |= CLONE_EMPTY_MNTNS;
	}

	*new_nsp = create_new_namespaces(flags, current, user_ns,
					 new_fs ? new_fs : current->fs);
	if (IS_ERR(*new_nsp)) {
		err = PTR_ERR(*new_nsp);
+4 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
clone3_empty_mntns_test
empty_mntns_test
overmount_chroot_test
Loading