Unverified Commit a7ebb0fe authored by Christian Brauner
Browse files

Merge patch series "Support foreign mount namespace with statmount/listmount"

Josef Bacik <josef@toxicpanda.com> says:

Currently the only way to iterate over mount entries in mount namespaces that
aren't your own is to trawl through /proc in order to find /proc/$PID/mountinfo
for the mount namespace that you want.  This is hugely inefficient, so extend
both statmount() and listmount() to allow specifying a mount namespace id in
order to get to mounts in other mount namespaces.

There are a few components to this

1. Having a global index of the mount namespace based on the ->seq value in the
   mount namespace.  This gives us a unique identifier that isn't re-used.
2. Support looking up mount namespaces based on that unique identifier, and
   validating the user has permission to access the given mount namespace.
3. Provide a new ioctl() on nsfs in order to extract the unique identifier we
   can use for statmount() and listmount().

The code is relatively straightforward, and there is a selftest provided to
validate everything works properly.

This is based on vfs.all as of last week, so must be applied onto a tree that
has Christian's error handling rework in this area.  If you wish, you can pull
the tree directly here:

https://github.com/josefbacik/linux/tree/listmount.combined

Christian and I collaborated on this series, which is why there are patches
from both of us in this series.

Christian Brauner (4):
  fs: relax permissions for listmount()
  fs: relax permissions for statmount()
  fs: Allow listmount() in foreign mount namespace
  fs: Allow statmount() in foreign mount namespace

Josef Bacik (4):
  fs: keep an index of current mount namespaces
  fs: export the mount ns id via statmount
  fs: add an ioctl to get the mnt ns id from nsfs
  selftests: add a test for the foreign mnt ns extensions

fs/mount.h                                    |   2 +
 fs/namespace.c                                | 240 ++++++++++--
 fs/nsfs.c                                     |  14 +
 include/uapi/linux/mount.h                    |   6 +-
 include/uapi/linux/nsfs.h                     |   2 +
 .../selftests/filesystems/statmount/Makefile  |   2 +-
 .../filesystems/statmount/statmount.h         |  46 +++
 .../filesystems/statmount/statmount_test.c    |  53 +--
 .../filesystems/statmount/statmount_test_ns.c | 360 ++++++++++++++++++
 9 files changed, 659 insertions(+), 66 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h
 create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c

Link: https://lore.kernel.org/r/cover.1719243756.git.josef@toxicpanda.com


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents d04bccd8 d896f71c
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -16,6 +16,8 @@ struct mnt_namespace {
	u64 event;
	unsigned int		nr_mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
	refcount_t		passive; /* number references not pinning @mounts */
} __randomize_layout;

struct mnt_pcp {
+216 −24
Original line number Diff line number Diff line
@@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */

struct mount_kattr {
	unsigned int attr_set;
@@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj);
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/*
 * Three-way comparison of the sequence number @seq against the sequence
 * number of @ns: returns -1 if @seq is smaller, 1 if it is larger and 0 if
 * both are equal.
 */
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
	const u64 other = ns->seq;

	if (seq == other)
		return 0;
	return seq < other ? -1 : 1;
}

/*
 * Convert an rbtree node embedded as ->mnt_ns_tree_node back into its
 * enclosing mount namespace. A NULL @node yields NULL.
 */
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
	return node ? rb_entry(node, struct mnt_namespace, mnt_ns_tree_node)
		    : NULL;
}

static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
{
	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
	u64 seq_a = ns_a->seq;

	return mnt_ns_cmp(seq_a, ns_b) < 0;
}

/*
 * Insert @ns into the global mount namespace tree, ordered by mnt_ns_less().
 * The tree is modified with mnt_ns_tree_lock held for writing.
 */
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
	write_lock(&mnt_ns_tree_lock);
	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
	write_unlock(&mnt_ns_tree_lock);
}

/*
 * Drop one passive reference on @ns. When the last passive reference goes
 * away, the reference on the owning user namespace is put and @ns is freed.
 * Must not be called with mnt_ns_tree_lock held.
 */
static void mnt_ns_release(struct mnt_namespace *ns)
{
	lockdep_assert_not_held(&mnt_ns_tree_lock);

	/* still kept alive for {list,stat}mount() */
	if (!refcount_dec_and_test(&ns->passive))
		return;

	put_user_ns(ns->user_ns);
	kfree(ns);
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

/*
 * Unlink @ns from the global mount namespace tree (anonymous namespaces are
 * left alone) and then drop a passive reference on it.
 */
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
	if (!is_anon_ns(ns)) {
		/* erase under the tree lock, release only after dropping it */
		write_lock(&mnt_ns_tree_lock);
		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
		write_unlock(&mnt_ns_tree_lock);
	}

	mnt_ns_release(ns);
}

/*
 * Returns the mount namespace whose id equals @mnt_ns_id or, if there is no
 * exact match, the one with the smallest id greater than @mnt_ns_id. Returns
 * NULL when the tree holds no such namespace. Expects mnt_ns_tree_lock to be
 * held.
 */
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
{
	struct mnt_namespace *best = NULL;
	struct rb_node *cur;

	lockdep_assert_held(&mnt_ns_tree_lock);

	for (cur = mnt_ns_tree.rb_node; cur; ) {
		struct mnt_namespace *candidate = node_to_mnt_ns(cur);

		if (mnt_ns_id > candidate->seq) {
			/* this id is too small, only the right side can match */
			cur = cur->rb_right;
			continue;
		}

		/* remember the closest id seen so far that is large enough */
		best = candidate;
		if (mnt_ns_id == candidate->seq)
			break;
		cur = cur->rb_left;
	}
	return best;
}

/*
 * Look up the mount namespace with exactly the id @mnt_ns_id and return it
 * with a passive reference taken, or NULL if there is no such namespace.
 *
 * A passive reference does not keep the mounts alive: the namespace can still
 * be emptied if e.g., the last task holding an active reference exits. To
 * access the mounts of the namespace the @namespace_sem must first be
 * acquired. If the namespace has already shut down before acquiring
 * @namespace_sem, {list,stat}mount() will see that the mount rbtree of the
 * namespace is empty.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *found;

	guard(read_lock)(&mnt_ns_tree_lock);

	/* the tree search may hand back the next larger id; want exact only */
	found = mnt_ns_find_id_at(mnt_ns_id);
	if (found && found->seq == mnt_ns_id) {
		refcount_inc(&found->passive);
		return found;
	}

	return NULL;
}

static inline void lock_mount_hash(void)
{
	write_seqlock(&mount_lock);
@@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns)
	if (!is_anon_ns(ns))
		ns_free_inum(&ns->ns);
	dec_mnt_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);
	kfree(ns);
	mnt_ns_tree_remove(ns);
}

/*
@@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
	if (!anon)
		new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
	refcount_set(&new_ns->ns.count, 1);
	refcount_set(&new_ns->passive, 1);
	new_ns->mounts = RB_ROOT;
	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
	init_waitqueue_head(&new_ns->poll);
	new_ns->user_ns = get_user_ns(user_ns);
	new_ns->ucounts = ucounts;
@@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(skip_mnt_tree(p), old);
	}
	mnt_ns_tree_add(new_ns);
	namespace_unlock();

	if (rootmnt)
@@ -4867,6 +4974,12 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
	return 0;
}

/*
 * Report the id (->seq) of mount namespace @ns in the statmount result and
 * flag STATMOUNT_MNT_NS_ID in the returned mask.
 */
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
	s->sm.mnt_ns_id = ns->seq;
	s->sm.mask |= STATMOUNT_MNT_NS_ID;
}

static int statmount_string(struct kstatmount *s, u64 flag)
{
	int ret;
@@ -4930,6 +5043,7 @@ static int copy_statmount_to_user(struct kstatmount *s)
static int do_statmount(struct kstatmount *s)
{
	struct mount *m = real_mount(s->mnt);
	struct mnt_namespace *ns = m->mnt_ns;
	int err;

	/*
@@ -4937,7 +5051,7 @@ static int do_statmount(struct kstatmount *s)
	 * mounts to show users.
	 */
	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	err = security_sb_statfs(s->mnt->mnt_root);
@@ -4962,6 +5076,9 @@ static int do_statmount(struct kstatmount *s)
	if (!err && s->mask & STATMOUNT_MNT_POINT)
		err = statmount_string(s, STATMOUNT_MNT_POINT);

	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
		statmount_mnt_ns_id(s, ns);

	if (err)
		return err;

@@ -5003,7 +5120,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
	int ret;
	size_t usize;

	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);

	ret = get_user(usize, &req->size);
	if (ret)
@@ -5021,10 +5138,63 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
	return 0;
}

/*
 * Step from @curr to the neighbouring mount in rbtree order: the previous
 * node when @reverse is set, the next node otherwise. The resulting node is
 * converted with node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *pos = &curr->mnt_node;

	return node_to_mount(reverse ? rb_prev(pos) : rb_next(pos));
}

static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
{
	struct mount *first;

	rwsem_assert_held(&namespace_sem);

	/* We're looking at our own ns, just use get_fs_root. */
	if (ns == current->nsproxy->mnt_ns) {
		get_fs_root(current->fs, root);
		return 0;
	}

	/*
	 * We have to find the first mount in our ns and use that, however it
	 * may not exist, so handle that properly.
	 */
	if (RB_EMPTY_ROOT(&ns->mounts))
		return -ENOENT;

	first = listmnt_next(ns->root, false);
	if (!first)
		return -ENOENT;
	root->mnt = mntget(&first->mnt);
	root->dentry = dget(root->mnt->mnt_root);
	return 0;
}

/*
 * Resolve the mount namespace a request refers to. A non-zero @mnt_ns_id is
 * looked up via lookup_mnt_ns() (which may return NULL); a zero id selects
 * the caller's own mount namespace, on which a passive reference is taken
 * before it is returned.
 */
static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *own;

	if (mnt_ns_id)
		return lookup_mnt_ns(mnt_ns_id);

	own = current->nsproxy->mnt_ns;
	refcount_inc(&own->passive);
	return own;
}

SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
		struct statmount __user *, buf, size_t, bufsize,
		unsigned int, flags)
{
	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
	struct vfsmount *mnt;
	struct mnt_id_req kreq;
	struct kstatmount ks;
@@ -5039,13 +5209,28 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
	if (ret)
		return ret;

	ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
	if (!ns)
		return -ENOENT;

	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

retry:
	ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
	if (ret)
		return ret;

	down_read(&namespace_sem);
	mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
	/* Has the namespace already been emptied? */
	if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) {
		up_read(&namespace_sem);
		kvfree(ks.seq.buf);
		return -ENOENT;
	}

	mnt = lookup_mnt_in_ns(kreq.mnt_id, ns);
	if (!mnt) {
		up_read(&namespace_sem);
		kvfree(ks.seq.buf);
@@ -5053,7 +5238,12 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
	}

	ks.mnt = mnt;
	get_fs_root(current->fs, &ks.root);
	ret = grab_requested_root(ns, &ks.root);
	if (ret) {
		up_read(&namespace_sem);
		kvfree(ks.seq.buf);
		return ret;
	}
	ret = do_statmount(&ks);
	path_put(&ks.root);
	up_read(&namespace_sem);
@@ -5066,30 +5256,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
	return ret;
}

/*
 * Return the mount adjacent to @curr in rbtree order: the previous node if
 * @reverse is true, the next node otherwise. The rb_node is converted with
 * node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
	struct rb_node *node;

	if (reverse)
		node = rb_prev(&curr->mnt_node);
	else
		node = rb_next(&curr->mnt_node);

	return node_to_mount(node);
}

static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
			    size_t nr_mnt_ids, bool reverse)
static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
			    u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
			    bool reverse)
{
	struct path root __free(path_put) = {};
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct path orig;
	struct mount *r, *first;
	ssize_t ret;

	rwsem_assert_held(&namespace_sem);

	get_fs_root(current->fs, &root);
	ret = grab_requested_root(ns, &root);
	if (ret)
		return ret;

	if (mnt_parent_id == LSMT_ROOT) {
		orig = root;
	} else {
@@ -5104,7 +5285,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids,
	 * mounts to show users.
	 */
	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
	    !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	ret = security_sb_statfs(orig.dentry);
@@ -5141,6 +5322,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
{
	u64 *kmnt_ids __free(kvfree) = NULL;
	const size_t maxcount = 1000000;
	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
	struct mnt_id_req kreq;
	ssize_t ret;

@@ -5167,8 +5349,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
	if (!kmnt_ids)
		return -ENOMEM;

	ns = grab_requested_mnt_ns(kreq.mnt_ns_id);
	if (!ns)
		return -ENOENT;

	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
		return -ENOENT;

	scoped_guard(rwsem_read, &namespace_sem)
		ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids,
		ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids,
				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));

	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
@@ -5204,6 +5394,8 @@ static void __init init_mount_tree(void)

	set_fs_pwd(current->fs, &root);
	set_fs_root(current->fs, &root);

	mnt_ns_tree_add(ns);
}

void __init mnt_init(void)
+14 −0
Original line number Diff line number Diff line
@@ -12,6 +12,7 @@
#include <linux/nsfs.h>
#include <linux/uaccess.h>

#include "mount.h"
#include "internal.h"

static struct vfsmount *nsfs_mnt;
@@ -143,6 +144,19 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
		argp = (uid_t __user *) arg;
		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
		return put_user(uid, argp);
	case NS_GET_MNTNS_ID: {
		struct mnt_namespace *mnt_ns;
		__u64 __user *idp;
		__u64 id;

		if (ns->ops->type != CLONE_NEWNS)
			return -EINVAL;

		mnt_ns = container_of(ns, struct mnt_namespace, ns);
		idp = (__u64 __user *)arg;
		id = mnt_ns->seq;
		return put_user(id, idp);
	}
	default:
		return -ENOTTY;
	}
+5 −1
Original line number Diff line number Diff line
@@ -172,7 +172,8 @@ struct statmount {
	__u64 propagate_from;	/* Propagation from in current namespace */
	__u32 mnt_root;		/* [str] Root of mount relative to root of fs */
	__u32 mnt_point;	/* [str] Mountpoint relative to current root */
	__u64 __spare2[50];
	__u64 mnt_ns_id;	/* ID of the mount namespace */
	__u64 __spare2[49];
	char str[];		/* Variable size part containing strings */
};

@@ -188,10 +189,12 @@ struct mnt_id_req {
	__u32 spare;
	__u64 mnt_id;
	__u64 param;
	__u64 mnt_ns_id;
};

/* List of all mnt_id_req versions. */
#define MNT_ID_REQ_SIZE_VER0	24 /* sizeof first published struct */
#define MNT_ID_REQ_SIZE_VER1	32 /* sizeof second published struct */

/*
 * @mask bits for statmount(2)
@@ -202,6 +205,7 @@ struct mnt_id_req {
#define STATMOUNT_MNT_ROOT		0x00000008U	/* Want/got mnt_root  */
#define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got mnt_point */
#define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got fs_type */
#define STATMOUNT_MNT_NS_ID		0x00000040U	/* Want/got mnt_ns_id */

/*
 * Special @mnt_id values that can be passed to listmount
+2 −0
Original line number Diff line number Diff line
@@ -15,5 +15,7 @@
#define NS_GET_NSTYPE		_IO(NSIO, 0x3)
/* Get owner UID (in the caller's user namespace) for a user namespace */
#define NS_GET_OWNER_UID	_IO(NSIO, 0x4)
/* Get the id for a mount namespace */
#define NS_GET_MNTNS_ID		_IO(NSIO, 0x5)

#endif /* __LINUX_NSFS_H */
Loading