Unverified Commit 5dcbd85d authored by Christian Brauner's avatar Christian Brauner
Browse files

fs: lockless mntns rbtree lookup

Currently we use a read-write lock but for the simple search case we can
make this lockless. Creating a new mount namespace is a rather rare
event compared with querying mounts in a foreign mount namespace. Once
this is picked up by e.g., systemd to list mounts in another mount in
it's isolated services or in containers this will be used a lot so this
seems worthwhile doing.

Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-3-6e3cdaf9b280@kernel.org


Reviewed-by: default avatarJeff Layton <jlayton@kernel.org>
Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parent 144acef3
Loading
Loading
Loading
Loading
+4 −1
Original line number Diff line number Diff line
@@ -12,7 +12,10 @@ struct mnt_namespace {
	struct user_namespace	*user_ns;
	struct ucounts		*ucounts;
	u64			seq;	/* Sequence number to prevent loops */
	union {
		wait_queue_head_t	poll;
		struct rcu_head		mnt_ns_rcu;
	};
	u64 event;
	unsigned int		nr_mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
+70 −46
Original line number Diff line number Diff line
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);

static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */

struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
	u64 seq_b = ns->seq;

	if (seq < seq_b)
		return -1;
	if (seq > seq_b)
		return 1;
	return 0;
}

static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
	if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}

static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
	u64 seq_a = ns_a->seq;
	u64 seq_b = ns_b->seq;

	if (seq_a < seq_b)
		return -1;
	if (seq_a > seq_b)
		return 1;
	return 0;
}

static inline void mnt_ns_tree_write_lock(void)
{
	write_lock(&mnt_ns_tree_lock);
	write_seqcount_begin(&mnt_ns_tree_seqcount);
}

	return mnt_ns_cmp(seq_a, ns_b) < 0;
static inline void mnt_ns_tree_write_unlock(void)
{
	write_seqcount_end(&mnt_ns_tree_seqcount);
	write_unlock(&mnt_ns_tree_lock);
}

static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
	guard(write_lock)(&mnt_ns_tree_lock);
	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
	struct rb_node *node;

	mnt_ns_tree_write_lock();
	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
	mnt_ns_tree_write_unlock();

	WARN_ON_ONCE(node);
}

static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
}

static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
	/* remove from global mount namespace list */
	if (!is_anon_ns(ns)) {
		guard(write_lock)(&mnt_ns_tree_lock);
		mnt_ns_tree_write_lock();
		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
		mnt_ns_tree_write_unlock();
	}

	mnt_ns_release(ns);
	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}

/*
 * Returns the mount namespace which either has the specified id, or has the
 * next smallest id afer the specified one.
 */
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
static int mnt_ns_find(const void *key, const struct rb_node *node)
{
	struct rb_node *node = mnt_ns_tree.rb_node;
	struct mnt_namespace *ret = NULL;

	lockdep_assert_held(&mnt_ns_tree_lock);

	while (node) {
		struct mnt_namespace *n = node_to_mnt_ns(node);
	const u64 mnt_ns_id = *(u64 *)key;
	const struct mnt_namespace *ns = node_to_mnt_ns(node);

		if (mnt_ns_id <= n->seq) {
			ret = node_to_mnt_ns(node);
			if (mnt_ns_id == n->seq)
				break;
			node = node->rb_left;
		} else {
			node = node->rb_right;
		}
	}
	return ret;
	if (mnt_ns_id < ns->seq)
		return -1;
	if (mnt_ns_id > ns->seq)
		return 1;
	return 0;
}

/*
@@ -194,16 +199,35 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
 * namespace the @namespace_sem must first be acquired. If the namespace has
 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
 * see that the mount rbtree of the namespace is empty.
 *
 * Note the lookup is lockless protected by a sequence counter. We only
 * need to guard against false negatives as false positives aren't
 * possible. So if we didn't find a mount namespace and the sequence
 * counter has changed we need to retry. If the sequence counter is
 * still the same we know the search actually failed.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *ns;
	struct rb_node *node;
	unsigned int seq;

       guard(read_lock)(&mnt_ns_tree_lock);
       ns = mnt_ns_find_id_at(mnt_ns_id);
       if (!ns || ns->seq != mnt_ns_id)
	guard(rcu)();
	do {
		seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
		if (node)
			break;
	} while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));

	if (!node)
		return NULL;

	/*
	 * The last reference count is put with RCU delay so we can
	 * unconditonally acquire a reference here.
	 */
	ns = node_to_mnt_ns(node);
	refcount_inc(&ns->passive);
	return ns;
}