Unverified Commit a41dbf5e authored by Christian Brauner's avatar Christian Brauner
Browse files

mount: hold namespace_sem across copy in create_new_namespace()



Fix an oversight when creating a new mount namespace. If someone had the
bright idea to make the real rootfs a shared or dependent mount and it
is later copied the copy will become a peer of the old real rootfs mount
or a dependent mount of it. The namespace semaphore is dropped and we
use mount lock exact to lock the new real root mount. If that fails or
the subsequent do_loopback() fails we rely on the copy of the real root
mount to be cleaned up by path_put(). The problem is that this doesn't
deal with mount propagation and will leave the mounts linked in the
propagation lists.

When creating a new mount namespace create_new_namespace() first
acquires namespace_sem to clone the nullfs root, drops it, then
reacquires it via LOCK_MOUNT_EXACT which takes inode_lock first to
respect the inode_lock -> namespace_sem lock ordering. This
drop-and-reacquire pattern is fragile and was the source of the
propagation cleanup bug fixed in the preceding commit.

Extend lock_mount_exact() with a copy_mount mode that clones the mount
under the locks atomically. When copy_mount is true, path_overmounted()
is skipped since we're copying the mount, not mounting on top of it -
the nullfs root always has rootfs mounted on top so the check would
always fail. If clone_mnt() fails after get_mountpoint() has pinned the
mountpoint, __unlock_mount() is used to properly unpin the mountpoint
and release both locks.

This allows create_new_namespace() to use LOCK_MOUNT_EXACT_COPY which
takes inode_lock and namespace_sem once and holds them throughout the
clone and subsequent mount operations, eliminating the
drop-and-reacquire pattern entirely.

Reported-by: default avatar <syzbot+a89f9434fb5a001ccd58@syzkaller.appspotmail.com>
Fixes: 9b8a0ba6 ("mount: add OPEN_TREE_NAMESPACE") # mainline only
Link: https://lore.kernel.org/699047f6.050a0220.2757fb.0024.GAE@google.com


Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parent ac838961
Loading
Loading
Loading
Loading
+57 −54
Original line number Diff line number Diff line
@@ -2791,7 +2791,8 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
}

static void lock_mount_exact(const struct path *path,
			     struct pinned_mountpoint *mp);
			     struct pinned_mountpoint *mp, bool copy_mount,
			     unsigned int copy_flags);

#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
@@ -2799,7 +2800,10 @@ static void lock_mount_exact(const struct path *path,
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp)
	lock_mount_exact((path), &mp, false, 0)
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp, true, (copy_flags))

static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
@@ -3073,16 +3077,13 @@ static struct file *open_detached_copy(struct path *path, unsigned int flags)
	return file;
}

DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))

static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
{
	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
	struct path to_path __free(path_put) = {};
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct user_namespace *user_ns = current_user_ns();
	struct mount *new_ns_root;
	struct mnt_namespace *new_ns;
	struct mount *new_ns_root, *old_ns_root;
	struct path to_path;
	struct mount *mnt;
	unsigned int copy_flags = 0;
	bool locked = false;
@@ -3094,71 +3095,63 @@ static struct mnt_namespace *create_new_namespace(struct path *path, unsigned in
	if (IS_ERR(new_ns))
		return ERR_CAST(new_ns);

	scoped_guard(namespace_excl) {
		new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
		if (IS_ERR(new_ns_root))
			return ERR_CAST(new_ns_root);
	old_ns_root = ns->root;
	to_path.mnt = &old_ns_root->mnt;
	to_path.dentry = old_ns_root->mnt.mnt_root;

	VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);

	LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
	if (IS_ERR(mp.parent)) {
		free_mnt_ns(new_ns);
		return ERR_CAST(mp.parent);
	}
	new_ns_root = mp.parent;

	/*
	 * If the real rootfs had a locked mount on top of it somewhere
	 * in the stack, lock the new mount tree as well so it can't be
	 * exposed.
	 */
		mnt = ns->root;
	mnt = old_ns_root;
	while (mnt->overmount) {
		mnt = mnt->overmount;
		if (mnt->mnt.mnt_flags & MNT_LOCKED)
			locked = true;
	}
	}

	/*
	 * We dropped the namespace semaphore so we can actually lock
	 * the copy for mounting. The copied mount isn't attached to any
	 * mount namespace and it is thus excluded from any propagation.
	 * So realistically we're isolated and the mount can't be
	 * overmounted.
	 */

	/* Borrow the reference from clone_mnt(). */
	to_path.mnt = &new_ns_root->mnt;
	to_path.dentry = dget(new_ns_root->mnt.mnt_root);

	/* Now lock for actual mounting. */
	LOCK_MOUNT_EXACT(mp, &to_path);
	if (unlikely(IS_ERR(mp.parent)))
		return ERR_CAST(mp.parent);

	/*
	 * We don't emulate unshare()ing a mount namespace. We stick to the
	 * restrictions of creating detached bind-mounts. It has a lot
	 * saner and simpler semantics.
	 * We don't emulate unshare()ing a mount namespace. We stick
	 * to the restrictions of creating detached bind-mounts. It
	 * has a lot saner and simpler semantics.
	 */
	mnt = __do_loopback(path, flags, copy_flags);
	if (IS_ERR(mnt))
	scoped_guard(mount_writer) {
		if (IS_ERR(mnt)) {
			emptied_ns = new_ns;
			umount_tree(new_ns_root, 0);
			return ERR_CAST(mnt);
		}

	scoped_guard(mount_writer) {
		if (locked)
			mnt->mnt.mnt_flags |= MNT_LOCKED;
		/*
		 * Now mount the detached tree on top of the copy of the
		 * real rootfs we created.
		 * now mount the detached tree on top of the copy
		 * of the real rootfs we created.
		 */
		attach_mnt(mnt, new_ns_root, mp.mp);
		if (user_ns != ns->user_ns)
			lock_mnt_tree(new_ns_root);
	}

	/* Add all mounts to the new namespace. */
	for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
		mnt_add_to_ns(new_ns, p);
	for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
		mnt_add_to_ns(new_ns, mnt);
		new_ns->nr_mounts++;
	}

	new_ns->root = real_mount(no_free_ptr(to_path.mnt));
	new_ns->root = new_ns_root;
	ns_tree_add_raw(new_ns);
	return no_free_ptr(new_ns);
	return new_ns;
}

static struct file *open_new_namespace(struct path *path, unsigned int flags)
@@ -3840,16 +3833,20 @@ static int do_new_mount(const struct path *path, const char *fstype,
}

static void lock_mount_exact(const struct path *path,
			     struct pinned_mountpoint *mp)
			     struct pinned_mountpoint *mp, bool copy_mount,
			     unsigned int copy_flags)
{
	struct dentry *dentry = path->dentry;
	int err;

	/* Assert that inode_lock() locked the correct inode. */
	VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path));

	inode_lock(dentry->d_inode);
	namespace_lock();
	if (unlikely(cant_mount(dentry)))
		err = -ENOENT;
	else if (path_overmounted(path))
	else if (!copy_mount && path_overmounted(path))
		err = -EBUSY;
	else
		err = get_mountpoint(dentry, mp);
@@ -3857,9 +3854,15 @@ static void lock_mount_exact(const struct path *path,
		namespace_unlock();
		inode_unlock(dentry->d_inode);
		mp->parent = ERR_PTR(err);
	} else {
		mp->parent = real_mount(path->mnt);
		return;
	}

	if (copy_mount)
		mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags);
	else
		mp->parent = real_mount(path->mnt);
	if (unlikely(IS_ERR(mp->parent)))
		__unlock_mount(mp);
}

int finish_automount(struct vfsmount *__m, const struct path *path)