Unverified Commit ae901e5e authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "ns: fixes for namespace iteration and active reference counting"

Christian Brauner <brauner@kernel.org> says:

* Make sure to initialize the active reference count for the initial
  network namespace and prevent __ns_common_init() from returning too
  early.

* Make sure that passive reference counts are dropped outside of rcu
  read locks as some namespaces such as the mount namespace do in fact
  sleep when putting the last reference.

* The setns() system call supports:

  (1) namespace file descriptors (nsfd)
  (2) process file descriptors (pidfd)

  When using nsfds the namespaces will remain active because they are
  pinned by the vfs. However, when pidfds are used things are more
  complicated.

  If the target task exits and passes through exit_nsproxy_namespaces(),
  or is reaped and thus also passes through exit_cred_namespaces(), after
  the setns()'ing task has called prepare_nsset() but before it has called
  commit_nsset(), the active reference count of the set of namespaces it
  wants to setns() to might have been dropped already:

    P1                                                              P2

    pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                    pidfd = pidfd_open(pid_p1)
                                                                    setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
                                                                    prepare_nsset()

    exit(0)
    // ns->__ns_active_ref        == 1
    // parent_ns->__ns_active_ref == 1
    -> exit_nsproxy_namespaces()
    -> exit_cred_namespaces()

    // ns_active_ref_put() will also put
    // the reference on the owner of the
    // namespace. If the only reason the
    // owning namespace was alive was
    // because it was a parent of @ns
    // its active reference count now goes
    // to zero... --------------------------------
    //                                           |
    // ns->__ns_active_ref        == 0           |
    // parent_ns->__ns_active_ref == 0           |
                                                 |                  commit_nsset()
                                                 -----------------> // If setns()
                                                                    // now manages to install the namespaces
                                                                    // it will call ns_active_ref_get()
                                                                    // on them thus bumping the active reference
                                                                    // count from zero again but without also
                                                                    // taking the required reference on the owner.
                                                                    // Thus we get:
                                                                    //
                                                                    // ns->__ns_active_ref        == 1
                                                                    // parent_ns->__ns_active_ref == 0

    When later someone does ns_active_ref_put() on @ns it will underflow
    parent_ns->__ns_active_ref leading to a splat from our asserts
    thinking there are still active references when in fact the counter
    just underflowed.

  So resurrect the ownership chain if necessary as well. If the caller
  managed to grab passive references to the set of namespaces the
  setns() should simply succeed even if the target task exits or gets
  reaped in the meantime.

  The race is rare and can only be triggered when using pidfds to setns()
  to namespaces. Also note that active references on initial namespaces
  are no-ops.

  Since we now always handle parent references directly we can drop
  ns_ref_active_get_owner() when adding a namespace to a namespace tree.
  This is now all handled uniformly in the places where the new namespaces
  actually become active.

* patches from https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org:
  selftests/namespaces: test for efault
  selftests/namespaces: add active reference count regression test
  ns: add asserts for active refcount underflow
  ns: handle setns(pidfd, ...) cleanly
  ns: return EFAULT on put_user() error
  ns: make sure reference are dropped outside of rcu lock
  ns: don't increment or decrement initial namespaces
  ns: don't skip active reference count initialization

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 8ebfb989 07d7ad46
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -430,7 +430,7 @@ static int nsfs_init_inode(struct inode *inode, void *data)
	 * ioctl on such a socket will resurrect the relevant namespace
	 * subtree.
	 */
	__ns_ref_active_resurrect(ns);
	__ns_ref_active_get(ns);
	return 0;
}

+13 −36
Original line number Diff line number Diff line
@@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns)
				 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1));
}

/*
 * Report whether @ns carries one of the initial namespace ids, i.e. an
 * ns_id that is not greater than NS_LAST_INIT_ID.
 *
 * An ns_id of zero is not expected here and is flagged via
 * VFS_WARN_ON_ONCE() before the comparison is made.
 */
static __always_inline bool is_ns_init_id(const struct ns_common *ns)
{
	VFS_WARN_ON_ONCE(ns->ns_id == 0);
	return ns->ns_id <= NS_LAST_INIT_ID;
}

#define to_ns_common(__ns)                                    \
	_Generic((__ns),                                      \
		struct cgroup_namespace *:       &(__ns)->ns, \
@@ -281,54 +287,25 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns
#define ns_ref_active_read(__ns) \
	((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0)

void __ns_ref_active_get_owner(struct ns_common *ns);

static __always_inline void __ns_ref_active_get(struct ns_common *ns)
{
	WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
	VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0);
}
#define ns_ref_active_get(__ns) \
	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)

static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns)
{
	if (atomic_inc_not_zero(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
		return true;
	}
	return false;
}

#define ns_ref_active_get_owner(__ns) \
	do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0)

void __ns_ref_active_put_owner(struct ns_common *ns);
void __ns_ref_active_put(struct ns_common *ns);

static __always_inline void __ns_ref_active_put(struct ns_common *ns)
{
	if (atomic_dec_and_test(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(is_initial_namespace(ns));
		VFS_WARN_ON_ONCE(!__ns_ref_read(ns));
		__ns_ref_active_put_owner(ns);
	}
}
#define ns_ref_active_put(__ns) \
	do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0)

static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns)
{
	VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns));
	if (!__ns_ref_active_read(ns))
	if (!__ns_ref_active_read(ns)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		return NULL;
	}
	if (!__ns_ref_get(ns))
		return NULL;
	return ns;
}

void __ns_ref_active_resurrect(struct ns_common *ns);
void __ns_ref_active_get(struct ns_common *ns);

#define ns_ref_active_resurrect(__ns) \
	do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0)
#define ns_ref_active_get(__ns) \
	do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0)

#endif
+35 −17
Original line number Diff line number Diff line
@@ -54,7 +54,7 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)

int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	int ret;
	int ret = 0;

	refcount_set(&ns->__ns_ref, 1);
	ns->stashed = NULL;
@@ -74,10 +74,9 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope
	ns_debug(ns, ops);
#endif

	if (inum) {
	if (inum)
		ns->inum = inum;
		return 0;
	}
	else
		ret = proc_alloc_inum(&ns->inum);
	if (ret)
		return ret;
@@ -115,13 +114,6 @@ struct ns_common *__must_check ns_owner(struct ns_common *ns)
	return to_ns_common(owner);
}

void __ns_ref_active_get_owner(struct ns_common *ns)
{
	ns = ns_owner(ns);
	if (ns)
		WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
}

/*
 * The active reference count works by having each namespace that gets
 * created take a single active reference on its owning user namespace.
@@ -172,16 +164,31 @@ void __ns_ref_active_get_owner(struct ns_common *ns)
 * The iteration stops once we reach a namespace that still has active
 * references.
 */
void __ns_ref_active_put_owner(struct ns_common *ns)
void __ns_ref_active_put(struct ns_common *ns)
{
	/* Initial namespaces are always active. */
	if (is_ns_init_id(ns))
		return;

	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}

	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));

	for (;;) {
		ns = ns_owner(ns);
		if (!ns)
			return;
		if (!atomic_dec_and_test(&ns->__ns_ref_active))
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
			VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
			return;
		}
	}
}

/*
 * The active reference count works by having each namespace that gets
@@ -275,10 +282,18 @@ void __ns_ref_active_put_owner(struct ns_common *ns)
 * it also needs to take another reference on its owning user namespace
 * and so on.
 */
void __ns_ref_active_resurrect(struct ns_common *ns)
void __ns_ref_active_get(struct ns_common *ns)
{
	int prev;

	/* Initial namespaces are always active. */
	if (is_ns_init_id(ns))
		return;

	/* If we didn't resurrect the namespace we're done. */
	if (atomic_fetch_add(1, &ns->__ns_ref_active))
	prev = atomic_fetch_add(1, &ns->__ns_ref_active);
	VFS_WARN_ON_ONCE(prev < 0);
	if (likely(prev))
		return;

	/*
@@ -290,7 +305,10 @@ void __ns_ref_active_resurrect(struct ns_common *ns)
		if (!ns)
			return;

		if (atomic_fetch_add(1, &ns->__ns_ref_active))
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));
		prev = atomic_fetch_add(1, &ns->__ns_ref_active);
		VFS_WARN_ON_ONCE(prev < 0);
		if (likely(prev))
			return;
	}
}
+25 −19
Original line number Diff line number Diff line
@@ -173,14 +173,6 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
	write_sequnlock(&ns_tree_lock);

	VFS_WARN_ON_ONCE(node);

	/*
	 * Take an active reference on the owner namespace. This ensures
	 * that the owner remains visible while any of its child namespaces
	 * are active. For init namespaces this is a no-op as ns_owner()
	 * returns NULL for namespaces owned by init_user_ns.
	 */
	__ns_ref_active_get_owner(ns);
}

void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
@@ -505,13 +497,13 @@ static inline bool __must_check may_list_ns(const struct klistns *kls,
	return false;
}

static void __ns_put(struct ns_common *ns)
static inline void ns_put(struct ns_common *ns)
{
	if (ns->ops)
	if (ns && ns->ops)
		ns->ops->put(ns);
}

DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T))
DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T))

static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls,
							   struct ns_common *candidate)
@@ -535,7 +527,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
{
	u64 __user *ns_ids = kls->uns_ids;
	size_t nr_ns_ids = kls->nr_ns_ids;
	struct ns_common *ns = NULL, *first_ns = NULL;
	struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL;
	const struct list_head *head;
	ssize_t ret;

@@ -568,9 +560,10 @@ static ssize_t do_listns_userns(struct klistns *kls)

	if (!first_ns)
		first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry);

	for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids;
	     ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) {
		struct ns_common *valid __free(ns_put);
		struct ns_common *valid;

		valid = legitimize_ns(kls, ns);
		if (!valid)
@@ -578,8 +571,14 @@ static ssize_t do_listns_userns(struct klistns *kls)

		rcu_read_unlock();

		if (put_user(valid->ns_id, ns_ids + ret))
			return -EINVAL;
		ns_put(prev);
		prev = valid;

		if (put_user(valid->ns_id, ns_ids + ret)) {
			ns_put(prev);
			return -EFAULT;
		}

		nr_ns_ids--;
		ret++;

@@ -587,6 +586,7 @@ static ssize_t do_listns_userns(struct klistns *kls)
	}

	rcu_read_unlock();
	ns_put(prev);
	return ret;
}

@@ -668,7 +668,7 @@ static ssize_t do_listns(struct klistns *kls)
{
	u64 __user *ns_ids = kls->uns_ids;
	size_t nr_ns_ids = kls->nr_ns_ids;
	struct ns_common *ns, *first_ns = NULL;
	struct ns_common *ns, *first_ns = NULL, *prev = NULL;
	struct ns_tree *ns_tree = NULL;
	const struct list_head *head;
	u32 ns_type;
@@ -705,7 +705,7 @@ static ssize_t do_listns(struct klistns *kls)

	for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids;
	     ns = next_ns_common(ns, ns_tree)) {
		struct ns_common *valid __free(ns_put);
		struct ns_common *valid;

		valid = legitimize_ns(kls, ns);
		if (!valid)
@@ -713,8 +713,13 @@ static ssize_t do_listns(struct klistns *kls)

		rcu_read_unlock();

		if (put_user(valid->ns_id, ns_ids + ret))
			return -EINVAL;
		ns_put(prev);
		prev = valid;

		if (put_user(valid->ns_id, ns_ids + ret)) {
			ns_put(prev);
			return -EFAULT;
		}

		nr_ns_ids--;
		ret++;
@@ -723,6 +728,7 @@ static ssize_t do_listns(struct klistns *kls)
	}

	rcu_read_unlock();
	ns_put(prev);
	return ret;
}

+2 −0
Original line number Diff line number Diff line
@@ -4,7 +4,9 @@ init_ino_test
ns_active_ref_test
listns_test
listns_permissions_test
listns_efault_test
siocgskns_test
cred_change_test
stress_test
listns_pagination_bug
regression_pidfd_setns_test
Loading