Commit 100ceb48 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'vfs-6.14-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs mount updates from Christian Brauner:

 - Add a mountinfo program to demonstrate statmount()/listmount()

   Add a new "mountinfo" sample userland program that demonstrates how
   to use statmount() and listmount() to get at the same info that
   /proc/pid/mountinfo provides

 - Remove pointless nospec.h include

 - Prepend statmount.mnt_opts string with security_sb_mnt_opts()

   Currently these mount options aren't accessible via statmount()

 - Add new mount namespaces to mount namespace rbtree outside of the
   namespace semaphore

 - Lockless mount namespace lookup

   Currently we take the read lock when looking for a mount namespace to
   list mounts in. We can make this lockless. The simple search case can
   just use a sequence counter to detect concurrent changes to the
   rbtree

   For walking the list of mount namespaces sequentially via nsfs we
   keep a separate rcu list as rb_prev() and rb_next() aren't usable
   safely with rcu. Currently there is no primitive for retrieving the
   previous list member. To do this we need a new deletion primitive
   that doesn't poison the prev pointer and a corresponding retrieval
   helper

   Since creating mount namespaces is a relatively rare event compared
   with querying mounts in a foreign mount namespace this is worth it.
   Once libmount and systemd pick up this mechanism to list mounts in
   foreign mount namespaces this will be used very frequently

     - Add extended selftests for lockless mount namespace iteration

     - Add a sample program to list all mounts on the system, i.e., in
       all mount namespaces

 - Improve mount namespace iteration performance

   Make finding the last or first mount to start iterating the mount
   namespace from an O(1) operation and add selftests for iterating the
   mount table starting from the first and last mount

 - Use an xarray for the old mount id

   While the ida does use the xarray internally we can use it explicitly
   which allows us to increment the unique mount id under the xa lock.
   This allows us to remove the atomic as we're now allocating both ids
   in one go

 - Use a shared header for vfs sample programs

 - Fix build warnings for new sample program to list all mounts

* tag 'vfs-6.14-rc1.mount.v2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  samples/vfs: fix build warnings
  samples/vfs: use shared header
  samples/vfs/mountinfo: Use __u64 instead of uint64_t
  fs: remove useless lockdep assertion
  fs: use xarray for old mount id
  selftests: add listmount() iteration tests
  fs: cache first and last mount
  samples: add test-list-all-mounts
  selftests: remove unneeded include
  selftests: add tests for mntns iteration
  seltests: move nsfs into filesystems subfolder
  fs: simplify rwlock to spinlock
  fs: lockless mntns lookup for nsfs
  rculist: add list_bidir_{del,prev}_rcu()
  fs: lockless mntns rbtree lookup
  fs: add mount namespace to rbtree late
  fs: prepend statmount.mnt_opts string with security_sb_mnt_opts()
  mount: remove inlude/nospec.h include
  samples: add a mountinfo program to demonstrate statmount()/listmount()
parents 1a89a692 68e6b7d9
Loading
Loading
Loading
Loading
+19 −12
Original line number Diff line number Diff line
@@ -8,15 +8,23 @@
/*
 * State of a single mount namespace.
 *
 * The mounts of the namespace live in the @mounts rbtree. The first and
 * last node of that tree are cached next to it so that iteration can
 * start from either end without walking the tree.
 */
struct mnt_namespace {
	struct ns_common	ns;
	struct mount *	root;
	/* Mount rbtree together with its cached leftmost/rightmost nodes. */
	struct {
		struct rb_root	mounts;		 /* Protected by namespace_sem */
		struct rb_node	*mnt_last_node;	 /* last (rightmost) mount in the rbtree */
		struct rb_node	*mnt_first_node; /* first (leftmost) mount in the rbtree */
	};
	struct user_namespace	*user_ns;
	struct ucounts		*ucounts;
	u64			seq;	/* Sequence number to prevent loops */
	/*
	 * @poll and @mnt_ns_rcu share storage.
	 * NOTE(review): presumably @poll is no longer used once @mnt_ns_rcu
	 * has been queued for RCU-delayed release - confirm against callers.
	 */
	union {
		wait_queue_head_t	poll;
		struct rcu_head		mnt_ns_rcu;
	};
	u64 event;
	unsigned int		nr_mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
	struct list_head	mnt_ns_list; /* entry in the sequential list of mount namespaces */
	refcount_t		passive; /* number of references not pinning @mounts */
} __randomize_layout;

@@ -150,22 +158,21 @@ static inline bool mnt_ns_attached(const struct mount *mnt)

static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
	struct mnt_namespace *ns = mnt->mnt_ns;
	WARN_ON(!mnt_ns_attached(mnt));
	rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
	if (ns->mnt_last_node == &mnt->mnt_node)
		ns->mnt_last_node = rb_prev(&mnt->mnt_node);
	if (ns->mnt_first_node == &mnt->mnt_node)
		ns->mnt_first_node = rb_next(&mnt->mnt_node);
	rb_erase(&mnt->mnt_node, &ns->mounts);
	RB_CLEAR_NODE(&mnt->mnt_node);
	list_add_tail(&mnt->mnt_list, dt_list);
}

bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
{
	return __lookup_next_mnt_ns(mntns, false);
}
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
{
	return __lookup_next_mnt_ns(mntns, true);
}
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
					    bool previous);

static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
	return container_of(ns, struct mnt_namespace, ns);
+124 −76
Original line number Diff line number Diff line
@@ -33,7 +33,6 @@
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
#include <linux/nospec.h>

#include "pnode.h"
#include "internal.h"
@@ -67,12 +66,12 @@ static int __init set_mphash_entries(char *str)
__setup("mphash_entries=", set_mphash_entries);

static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;

static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -80,8 +79,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static DEFINE_SEQLOCK(mnt_ns_tree_lock);

static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

struct mount_kattr {
	unsigned int attr_set;
@@ -107,17 +108,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
	u64 seq_b = ns->seq;

	if (seq < seq_b)
		return -1;
	if (seq > seq_b)
		return 1;
	return 0;
}

static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
	if (!node)
@@ -125,25 +115,52 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}

static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
	u64 seq_a = ns_a->seq;
	u64 seq_b = ns_b->seq;

	return mnt_ns_cmp(seq_a, ns_b) < 0;
	if (seq_a < seq_b)
		return -1;
	if (seq_a > seq_b)
		return 1;
	return 0;
}

/*
 * Take the write side of the mnt_ns_tree_lock seqlock. Must be paired
 * with mnt_ns_tree_write_unlock().
 */
static inline void mnt_ns_tree_write_lock(void)
{
	write_seqlock(&mnt_ns_tree_lock);
}

/*
 * Release the write side of the mnt_ns_tree_lock seqlock taken by
 * mnt_ns_tree_write_lock().
 */
static inline void mnt_ns_tree_write_unlock(void)
{
	write_sequnlock(&mnt_ns_tree_lock);
}

static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
	guard(write_lock)(&mnt_ns_tree_lock);
	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
	struct rb_node *node, *prev;

	mnt_ns_tree_write_lock();
	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
	/*
	 * If there's no previous entry simply add it after the
	 * head and if there is add it after the previous entry.
	 */
	prev = rb_prev(&ns->mnt_ns_tree_node);
	if (!prev)
		list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
	else
		list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
	mnt_ns_tree_write_unlock();

	WARN_ON_ONCE(node);
}

static void mnt_ns_release(struct mnt_namespace *ns)
{
	lockdep_assert_not_held(&mnt_ns_tree_lock);

	/* keep alive for {list,stat}mount() */
	if (refcount_dec_and_test(&ns->passive)) {
		put_user_ns(ns->user_ns);
@@ -152,41 +169,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

/*
 * RCU callback: map the rcu_head embedded as ->mnt_ns_rcu back to its
 * mount namespace and pass that namespace to mnt_ns_release().
 */
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
}

static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
	/* remove from global mount namespace list */
	if (!is_anon_ns(ns)) {
		guard(write_lock)(&mnt_ns_tree_lock);
		mnt_ns_tree_write_lock();
		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
		list_bidir_del_rcu(&ns->mnt_ns_list);
		mnt_ns_tree_write_unlock();
	}

	mnt_ns_release(ns);
	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}

/*
 * Returns the mount namespace which either has the specified id, or has the
 * next smallest id after the specified one.
 */
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
static int mnt_ns_find(const void *key, const struct rb_node *node)
{
	struct rb_node *node = mnt_ns_tree.rb_node;
	struct mnt_namespace *ret = NULL;

	lockdep_assert_held(&mnt_ns_tree_lock);
	const u64 mnt_ns_id = *(u64 *)key;
	const struct mnt_namespace *ns = node_to_mnt_ns(node);

	while (node) {
		struct mnt_namespace *n = node_to_mnt_ns(node);

		if (mnt_ns_id <= n->seq) {
			ret = node_to_mnt_ns(node);
			if (mnt_ns_id == n->seq)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	return ret;
	if (mnt_ns_id < ns->seq)
		return -1;
	if (mnt_ns_id > ns->seq)
		return 1;
	return 0;
}

/*
@@ -196,16 +206,35 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
 * namespace the @namespace_sem must first be acquired. If the namespace has
 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
 * see that the mount rbtree of the namespace is empty.
 *
 * Note the lookup is lockless protected by a sequence counter. We only
 * need to guard against false negatives as false positives aren't
 * possible. So if we didn't find a mount namespace and the sequence
 * counter has changed we need to retry. If the sequence counter is
 * still the same we know the search actually failed.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *ns;
	struct rb_node *node;
	unsigned int seq;

	guard(rcu)();
	do {
		seq = read_seqbegin(&mnt_ns_tree_lock);
		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
		if (node)
			break;
	} while (read_seqretry(&mnt_ns_tree_lock, seq));

       guard(read_lock)(&mnt_ns_tree_lock);
       ns = mnt_ns_find_id_at(mnt_ns_id);
       if (!ns || ns->seq != mnt_ns_id)
	if (!node)
		return NULL;

	/*
	 * The last reference count is put with RCU delay so we can
	 * unconditionally acquire a reference here.
	 */
	ns = node_to_mnt_ns(node);
	refcount_inc(&ns->passive);
	return ns;
}
@@ -237,18 +266,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)

static int mnt_alloc_id(struct mount *mnt)
{
	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
	int res;

	if (res < 0)
	xa_lock(&mnt_id_xa);
	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
	if (!res)
		mnt->mnt_id_unique = ++mnt_id_ctr;
	xa_unlock(&mnt_id_xa);
	return res;
	mnt->mnt_id = res;
	mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
	return 0;
}

static void mnt_free_id(struct mount *mnt)
{
	ida_free(&mnt_id_ida, mnt->mnt_id);
	xa_erase(&mnt_id_xa, mnt->mnt_id);
}

/*
@@ -1125,16 +1155,25 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
	struct rb_node **link = &ns->mounts.rb_node;
	struct rb_node *parent = NULL;
	bool mnt_first_node = true, mnt_last_node = true;

	WARN_ON(mnt_ns_attached(mnt));
	mnt->mnt_ns = ns;
	while (*link) {
		parent = *link;
		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
			link = &parent->rb_left;
		else
			mnt_last_node = false;
		} else {
			link = &parent->rb_right;
			mnt_first_node = false;
		}
	}

	if (mnt_last_node)
		ns->mnt_last_node = &mnt->mnt_node;
	if (mnt_first_node)
		ns->mnt_first_node = &mnt->mnt_node;
	rb_link_node(&mnt->mnt_node, parent, link);
	rb_insert_color(&mnt->mnt_node, &ns->mounts);
}
@@ -2070,30 +2109,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
	return &mnt->ns;
}

struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
	guard(read_lock)(&mnt_ns_tree_lock);
	guard(rcu)();

	for (;;) {
		struct rb_node *node;
		struct list_head *list;

		if (previous)
			node = rb_prev(&mntns->mnt_ns_tree_node);
			list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
		else
			node = rb_next(&mntns->mnt_ns_tree_node);
		if (!node)
			list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
		if (list_is_head(list, &mnt_ns_list))
			return ERR_PTR(-ENOENT);

		mntns = node_to_mnt_ns(node);
		node = &mntns->mnt_ns_tree_node;
		mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);

		/*
		 * The last passive reference count is put with RCU
		 * delay so accessing the mount namespace is not just
		 * safe but all relevant members are still valid.
		 */
		if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
			continue;

		/*
		 * Holding mnt_ns_tree_lock prevents the mount namespace from
		 * being freed but it may well be on it's deathbed. We want an
		 * active reference, not just a passive one here as we're
		 * persisting the mount namespace.
		 * We need an active reference count as we're persisting
		 * the mount namespace and it might already be on its
		 * deathbed.
		 */
		if (!refcount_inc_not_zero(&mntns->ns.count))
			continue;
@@ -3915,6 +3958,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
	refcount_set(&new_ns->ns.count, 1);
	refcount_set(&new_ns->passive, 1);
	new_ns->mounts = RB_ROOT;
	INIT_LIST_HEAD(&new_ns->mnt_ns_list);
	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
	init_waitqueue_head(&new_ns->poll);
	new_ns->user_ns = get_user_ns(user_ns);
@@ -3994,7 +4038,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
		while (p->mnt.mnt_root != q->mnt.mnt_root)
			p = next_mnt(skip_mnt_tree(p), old);
	}
	mnt_ns_tree_add(new_ns);
	namespace_unlock();

	if (rootmnt)
@@ -4002,6 +4045,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
	if (pwdmnt)
		mntput(pwdmnt);

	mnt_ns_tree_add(new_ns);
	return new_ns;
}

@@ -5048,6 +5092,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
	if (sb->s_op->show_options) {
		size_t start = seq->count;

		err = security_sb_show_options(seq, sb);
		if (err)
			return err;

		err = sb->s_op->show_options(seq, mnt->mnt_root);
		if (err)
			return err;
@@ -5535,9 +5583,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,

	if (!last_mnt_id) {
		if (reverse)
			first = node_to_mount(rb_last(&ns->mounts));
			first = node_to_mount(ns->mnt_last_node);
		else
			first = node_to_mount(rb_first(&ns->mounts));
			first = node_to_mount(ns->mnt_first_node);
	} else {
		if (reverse)
			first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
+1 −4
Original line number Diff line number Diff line
@@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
		if (usize < MNT_NS_INFO_SIZE_VER0)
			return -EINVAL;

		if (previous)
			mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
		else
			mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
		mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
		if (IS_ERR(mnt_ns))
			return PTR_ERR(mnt_ns);

+44 −0
Original line number Diff line number Diff line
@@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 * way, we must not access it directly
 */
#define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))
/*
 * Return the ->prev pointer of a list_head in an rcu safe way. Don't
 * access it directly.
 *
 * Any list traversed with list_bidir_prev_rcu() must never use
 * list_del_rcu().  Doing so will poison the ->prev pointer that
 * list_bidir_prev_rcu() relies on, which will result in segfaults.
 * To prevent these segfaults, use list_bidir_del_rcu() instead
 * of list_del_rcu().
 */
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
@@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry)
	entry->prev = LIST_POISON2;
}

/**
 * list_bidir_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * In contrast to list_del_rcu(), this doesn't poison the prev pointer,
 * thus allowing backwards traversal via list_bidir_prev_rcu().
 *
 * Note: list_empty() on entry does not return true after this because
 * the entry is in a special undefined state that permits RCU-based
 * lockfree reverse traversal. In particular this means that we can not
 * poison the forward and backwards pointers that may still be used for
 * walking the list.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another list-mutation
 * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
 * this same list. However, it is perfectly legal to run concurrently
 * with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
 * the same list.
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_bidir_del_rcu(struct list_head *entry)
{
	/* Unlink only; the entry's own ->next and ->prev are left untouched. */
	__list_del_entry(entry);
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
+2 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0-only
/test-fsmount
/test-list-all-mounts
/test-statx
/mountinfo
Loading