Merge tag 'vfs-7.1-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs (7c8a4671) · Commits · git / linux-net

fs/namespace.c

+112 −82

Original line number	Diff line number	Diff line
		@@ -2646,6 +2646,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,

		if (unlikely(shorter) && child != source_mnt)
		mp = shorter;
		/*
		* If @q was locked it was meant to hide
		* whatever was under it. Let @child take over
		* that job and lock it, then we can unlock @q.
		* That'll allow another namespace to shed @q
		* and reveal @child. Clearly, that mounter
		* consented to this by not severing the mount
		* relationship. Otherwise, what's the point.
		*/
		if (IS_MNT_LOCKED(q)) {
		child->mnt.mnt_flags \|= MNT_LOCKED;
		q->mnt.mnt_flags &= ~MNT_LOCKED;
		}
		mnt_change_mountpoint(r, mp, q);
		}
		}
		@@ -2722,7 +2735,7 @@ static inline struct mount where_to_mount(const struct path path,
		* In all cases the location must not have been unmounted and the
		* chosen mountpoint must be allowed to be mounted on. For "beneath"
		* case we also require the location to be at the root of a mount
		* that has a parent (i.e. is not a root of some namespace).
		* that has something mounted on top of it (i.e. has an overmount).
		*/
		static void do_lock_mount(const struct path *path,
		struct pinned_mountpoint *res,
		@@ -2958,10 +2971,9 @@ static inline bool may_copy_tree(const struct path *path)
		}

		static struct mount __do_loopback(const struct path old_path,
		unsigned int flags, unsigned int copy_flags)
		bool recurse, unsigned int copy_flags)
		{
		struct mount *old = real_mount(old_path->mnt);
		bool recurse = flags & AT_RECURSIVE;

		if (IS_MNT_UNBINDABLE(old))
		return ERR_PTR(-EINVAL);
		@@ -2972,18 +2984,6 @@ static struct mount __do_loopback(const struct path old_path,
		if (!recurse && __has_locked_children(old, old_path->dentry))
		return ERR_PTR(-EINVAL);

		/*
		* When creating a new mount namespace we don't want to copy over
		* mounts of mount namespaces to avoid the risk of cycles and also to
		* minimize the default complex interdependencies between mount
		* namespaces.
		*
		* We could ofc just check whether all mount namespace files aren't
		* creating cycles but really let's keep this simple.
		*/
		if (!(flags & OPEN_TREE_NAMESPACE))
		copy_flags \|= CL_COPY_MNT_NS_FILE;

		if (recurse)
		return copy_tree(old, old_path->dentry, copy_flags);

		@@ -2998,7 +2998,6 @@ static int do_loopback(const struct path path, const char old_name,
		{
		struct path old_path __free(path_put) = {};
		struct mount *mnt = NULL;
		unsigned int flags = recurse ? AT_RECURSIVE : 0;
		int err;

		if (!old_name \|\| !*old_name)
		@@ -3017,7 +3016,7 @@ static int do_loopback(const struct path path, const char old_name,
		if (!check_mnt(mp.parent))
		return -EINVAL;

		mnt = __do_loopback(&old_path, flags, 0);
		mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
		if (IS_ERR(mnt))
		return PTR_ERR(mnt);

		@@ -3055,7 +3054,7 @@ static struct mnt_namespace get_detached_copy(const struct path path, unsigned
		ns->seq_origin = src_mnt_ns->ns.ns_id;
		}

		mnt = __do_loopback(path, flags, 0);
		mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
		if (IS_ERR(mnt)) {
		emptied_ns = ns;
		return ERR_CAST(mnt);
		@@ -3087,7 +3086,13 @@ static struct file open_detached_copy(struct path path, unsigned int flags)
		return file;
		}

		static struct mnt_namespace create_new_namespace(struct path path, unsigned int flags)
		/*
		 * Flags taken by create_new_namespace() and open_new_namespace() that
		 * select how the mount placed into the new mount namespace is obtained
		 * from the supplied path.
		 */
		enum mount_copy_flags_t {
		/*
		 * Passed on as the "recurse" argument of __do_loopback(), which then
		 * uses copy_tree() on the source mount. vfs_open_tree() sets this when
		 * the caller passed AT_RECURSIVE.
		 */
		MOUNT_COPY_RECURSIVE = (1 << 0),
		/*
		 * Skip __do_loopback() and clone_mnt() the mount behind the path
		 * directly. fsmount() passes this for FSMOUNT_NAMESPACE.
		 */
		MOUNT_COPY_NEW = (1 << 1),
		};

		static struct mnt_namespace create_new_namespace(struct path path,
		enum mount_copy_flags_t flags)
		{
		struct mnt_namespace *ns = current->nsproxy->mnt_ns;
		struct user_namespace *user_ns = current_user_ns();
		@@ -3096,7 +3101,7 @@ static struct mnt_namespace create_new_namespace(struct path path, unsigned in
		struct path to_path;
		struct mount *mnt;
		unsigned int copy_flags = 0;
		bool locked = false;
		bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE;

		if (user_ns != ns->user_ns)
		copy_flags \|= CL_SLAVE;
		@@ -3131,11 +3136,14 @@ static struct mnt_namespace create_new_namespace(struct path path, unsigned in
		}

		/*
		* We don't emulate unshare()ing a mount namespace. We stick
		* to the restrictions of creating detached bind-mounts. It
		* has a lot saner and simpler semantics.
		* We don't emulate unshare()ing a mount namespace. We stick to
		* the restrictions of creating detached bind-mounts. It has a
		* lot saner and simpler semantics.
		*/
		mnt = __do_loopback(path, flags, copy_flags);
		if (flags & MOUNT_COPY_NEW)
		mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags);
		else
		mnt = __do_loopback(path, recurse, copy_flags);
		scoped_guard(mount_writer) {
		if (IS_ERR(mnt)) {
		emptied_ns = new_ns;
		@@ -3164,7 +3172,8 @@ static struct mnt_namespace create_new_namespace(struct path path, unsigned in
		return new_ns;
		}

		static struct file open_new_namespace(struct path path, unsigned int flags)
		static struct file open_new_namespace(struct path path,
		enum mount_copy_flags_t flags)
		{
		struct mnt_namespace *new_ns;

		@@ -3217,7 +3226,7 @@ static struct file vfs_open_tree(int dfd, const char __user filename, unsigned
		return ERR_PTR(ret);

		if (flags & OPEN_TREE_NAMESPACE)
		return open_new_namespace(&path, flags);
		return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0);

		if (flags & OPEN_TREE_CLONE)
		return open_detached_copy(&path, flags);
		@@ -3513,8 +3522,6 @@ static bool mount_is_ancestor(const struct mount p1, const struct mount p2)
		* @mnt_to: mount under which to mount
		* @mp: mountpoint of @mnt_to
		*
		* - Make sure that nothing can be mounted beneath the caller's current
		* root or the rootfs of the namespace.
		* - Make sure that the caller can unmount the topmost mount ensuring
		* that the caller could reveal the underlying mountpoint.
		* - Ensure that nothing has been mounted on top of @mnt_from before we
		@@ -3528,26 +3535,14 @@ static bool mount_is_ancestor(const struct mount p1, const struct mount p2)
		*/
		static int can_move_mount_beneath(const struct mount *mnt_from,
		const struct mount *mnt_to,
		const struct mountpoint *mp)
		struct pinned_mountpoint *mp)
		{
		struct mount *parent_mnt_to = mnt_to->mnt_parent;

		if (IS_MNT_LOCKED(mnt_to))
		return -EINVAL;

		/* Avoid creating shadow mounts during mount propagation. */
		if (mnt_from->overmount)
		return -EINVAL;

		/*
		* Mounting beneath the rootfs only makes sense when the
		* semantics of pivot_root(".", ".") are used.
		*/
		if (&mnt_to->mnt == current->fs->root.mnt)
		return -EINVAL;
		if (parent_mnt_to == current->nsproxy->mnt_ns->root)
		return -EINVAL;

		if (mount_is_ancestor(mnt_to, mnt_from))
		return -EINVAL;

		@@ -3557,7 +3552,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
		* propagating a copy @c of @mnt_from on top of @mnt_to. This
		* defeats the whole purpose of mounting beneath another mount.
		*/
		if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
		if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp))
		return -EINVAL;

		/*
		@@ -3573,7 +3568,7 @@ static int can_move_mount_beneath(const struct mount *mnt_from,
		* @mnt_from beneath @mnt_to.
		*/
		if (check_mnt(mnt_from) &&
		propagation_would_overmount(parent_mnt_to, mnt_from, mp))
		propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp))
		return -EINVAL;

		return 0;
		@@ -3682,7 +3677,7 @@ static int do_move_mount(const struct path *old_path,

		if (mp.parent != over->mnt_parent)
		over = mp.parent->overmount;
		err = can_move_mount_beneath(old, over, mp.mp);
		err = can_move_mount_beneath(old, over, &mp);
		if (err)
		return err;
		}
		@@ -4231,8 +4226,8 @@ struct mnt_namespace copy_mnt_ns(u64 flags, struct mnt_namespace ns,
		struct user_namespace user_ns, struct fs_struct new_fs)
		{
		struct mnt_namespace *new_ns;
		struct vfsmount *rootmnt __free(mntput) = NULL;
		struct vfsmount *pwdmnt __free(mntput) = NULL;
		struct path old_root __free(path_put) = {};
		struct path old_pwd __free(path_put) = {};
		struct mount p, q;
		struct mount *old;
		struct mount *new;
		@@ -4252,10 +4247,17 @@ struct mnt_namespace copy_mnt_ns(u64 flags, struct mnt_namespace ns,
		return new_ns;

		guard(namespace_excl)();
		/* First pass: copy the tree topology */

		if (flags & CLONE_EMPTY_MNTNS)
		copy_flags = 0;
		else
		copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
		if (user_ns != ns->user_ns)
		copy_flags \|= CL_SLAVE;

		if (flags & CLONE_EMPTY_MNTNS)
		new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
		else
		new = copy_tree(old, old->mnt.mnt_root, copy_flags);
		if (IS_ERR(new)) {
		emptied_ns = new_ns;
		@@ -4267,10 +4269,29 @@ struct mnt_namespace copy_mnt_ns(u64 flags, struct mnt_namespace ns,
		}
		new_ns->root = new;

		if (flags & CLONE_EMPTY_MNTNS) {
		/*
		* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
		* as belonging to new namespace. We have already acquired a private
		* fs_struct, so tsk->fs->lock is not needed.
		* Empty mount namespace: only the root mount exists.
		* Reset root and pwd to the cloned mount's root dentry.
		*/
		if (new_fs) {
		old_root = new_fs->root;
		old_pwd = new_fs->pwd;

		new_fs->root.mnt = mntget(&new->mnt);
		new_fs->root.dentry = dget(new->mnt.mnt_root);

		new_fs->pwd.mnt = mntget(&new->mnt);
		new_fs->pwd.dentry = dget(new->mnt.mnt_root);
		}
		mnt_add_to_ns(new_ns, new);
		new_ns->nr_mounts++;
		} else {
		/*
		* Full copy: walk old and new trees in parallel, switching
		* the tsk->fs->* elements and marking new vfsmounts as
		* belonging to new namespace. We have already acquired a
		* private fs_struct, so tsk->fs->lock is not needed.
		*/
		p = old;
		q = new;
		@@ -4279,12 +4300,12 @@ struct mnt_namespace copy_mnt_ns(u64 flags, struct mnt_namespace ns,
		new_ns->nr_mounts++;
		if (new_fs) {
		if (&p->mnt == new_fs->root.mnt) {
		old_root.mnt = new_fs->root.mnt;
		new_fs->root.mnt = mntget(&q->mnt);
		rootmnt = &p->mnt;
		}
		if (&p->mnt == new_fs->pwd.mnt) {
		old_pwd.mnt = new_fs->pwd.mnt;
		new_fs->pwd.mnt = mntget(&q->mnt);
		pwdmnt = &p->mnt;
		}
		}
		p = next_mnt(p, old);
		@@ -4295,6 +4316,7 @@ struct mnt_namespace copy_mnt_ns(u64 flags, struct mnt_namespace ns,
		while (p->mnt.mnt_root != q->mnt.mnt_root)
		p = next_mnt(skip_mnt_tree(p), old);
		}
		}
		ns_tree_add_raw(new_ns);
		return new_ns;
		}
		@@ -4414,11 +4436,15 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
		unsigned int mnt_flags = 0;
		long ret;

		if (!may_mount())
		if ((flags & ~(FSMOUNT_CLOEXEC \| FSMOUNT_NAMESPACE)) != 0)
		return -EINVAL;

		if ((flags & FSMOUNT_NAMESPACE) &&
		!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

		if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
		return -EINVAL;
		if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
		return -EPERM;

		if (attr_flags & ~FSMOUNT_VALID_FLAGS)
		return -EINVAL;
		@@ -4485,6 +4511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
		*/
		vfs_clean_context(fc);

		if (flags & FSMOUNT_NAMESPACE)
		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
		open_new_namespace(&new_path, MOUNT_COPY_NEW));

		ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
		if (IS_ERR(ns))
		return PTR_ERR(ns);
		@@ -5649,14 +5679,14 @@ static int grab_requested_root(struct mnt_namespace ns, struct path root)
		if (mnt_ns_empty(ns))
		return -ENOENT;

		first = child = ns->root;
		for (;;) {
		child = listmnt_next(child, false);
		if (!child)
		return -ENOENT;
		if (child->mnt_parent == first)
		first = ns->root;
		for (child = node_to_mount(ns->mnt_first_node); child;
		child = listmnt_next(child, false)) {
		if (child != first && child->mnt_parent == first)
		break;
		}
		if (!child)
		return -ENOENT;

		root->mnt = mntget(&child->mnt);
		root->dentry = dget(root->mnt->mnt_root);

include/uapi/linux/mount.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -110,6 +110,7 @@ enum fsconfig_command {
		* fsmount() flags.
		*/
		#define FSMOUNT_CLOEXEC 0x00000001
		#define FSMOUNT_NAMESPACE 0x00000002 /* Create the mount in a new mount namespace */

		/*
		* Mount attributes.

include/uapi/linux/sched.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -39,6 +39,7 @@
		#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */
		#define CLONE_NNP (1ULL << 35) /* Set no_new_privs on child. */
		#define CLONE_PIDFD_AUTOKILL (1ULL << 36) /* Kill child when clone pidfd closes. */
		#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */

		/*
		* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
		@@ -46,6 +47,12 @@
		*/
		#define CLONE_NEWTIME 0x00000080 /* New time namespace */

		/*
		* unshare flags share the bit space with clone flags but only apply to the
		* unshare syscall:
		*/
		#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */

		#ifndef __ASSEMBLY__
		/**
		* struct clone_args - arguments for the clone3 syscall

kernel/fork.c

+16 −3

Original line number	Diff line number	Diff line
		@@ -2666,6 +2666,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
		int trace = 0;
		pid_t nr;

		/*
		* Creating an empty mount namespace implies creating a new mount
		* namespace. Set this before copy_process() so that the
		* CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
		*/
		if (clone_flags & CLONE_EMPTY_MNTNS) {
		clone_flags \|= CLONE_NEWNS;
		args->flags = clone_flags;
		}

		/*
		* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
		* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
		@@ -2944,8 +2954,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
		{
		/* Verify that no unknown flags are passed along. */
		if (kargs->flags &
		~(CLONE_LEGACY_FLAGS \| CLONE_CLEAR_SIGHAND \| CLONE_INTO_CGROUP \|
		CLONE_AUTOREAP \| CLONE_NNP \| CLONE_PIDFD_AUTOKILL))
		~(CLONE_LEGACY_FLAGS \| CLONE_CLEAR_SIGHAND \|
		CLONE_INTO_CGROUP \| CLONE_AUTOREAP \| CLONE_NNP \|
		CLONE_PIDFD_AUTOKILL \| CLONE_EMPTY_MNTNS))
		return false;

		/*
		@@ -3096,7 +3107,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
		{
		if (unshare_flags & ~(CLONE_THREAD\|CLONE_FS\|CLONE_SIGHAND\|
		CLONE_VM\|CLONE_FILES\|CLONE_SYSVSEM\|
		CLONE_NS_ALL))
		CLONE_NS_ALL \| UNSHARE_EMPTY_MNTNS))
		return -EINVAL;
		/*
		* Not implemented, but pretend it works if there is nothing
		@@ -3195,6 +3206,8 @@ int ksys_unshare(unsigned long unshare_flags)
		/*
		* If unsharing namespace, must also unshare filesystem information.
		*/
		if (unshare_flags & UNSHARE_EMPTY_MNTNS)
		unshare_flags \|= CLONE_NEWNS;
		if (unshare_flags & CLONE_NEWNS)
		unshare_flags \|= CLONE_FS;

kernel/nsproxy.c

+14 −3

Original line number	Diff line number	Diff line
		@@ -96,7 +96,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
		if (!new_nsp)
		return ERR_PTR(-ENOMEM);

		new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
		new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
		user_ns, new_fs);
		if (IS_ERR(new_nsp->mnt_ns)) {
		err = PTR_ERR(new_nsp->mnt_ns);
		goto out_ns;
		@@ -211,16 +212,26 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
		struct nsproxy *new_nsp, struct cred new_cred, struct fs_struct *new_fs)
		{
		struct user_namespace *user_ns;
		u64 flags = unshare_flags;
		int err = 0;

		if (!(unshare_flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
		if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))
		return 0;

		user_ns = new_cred ? new_cred->user_ns : current_user_ns();
		if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

		*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
		/*
		* Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
		* CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
		*/
		if (flags & UNSHARE_EMPTY_MNTNS) {
		flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
		flags \|= CLONE_EMPTY_MNTNS;
		}

		*new_nsp = create_new_namespaces(flags, current, user_ns,
		new_fs ? new_fs : current->fs);
		if (IS_ERR(*new_nsp)) {
		err = PTR_ERR(*new_nsp);