Unverified Commit 6d864a1b authored by Mateusz Guzik's avatar Mateusz Guzik Committed by Christian Brauner
Browse files

pid: only take pidmap_lock once on alloc



When spawning and killing threads in separate processes in parallel the
primary bottleneck on the stock kernel is pidmap_lock, largely because
of a back-to-back acquire in the common case. This aspect is fixed with
the patch.

Performance improvement varies between reboots. When benchmarking with
20 processes creating and killing threads in a loop, the unpatched
baseline hovers around 465k ops/s, while patched is anything between
~510k ops/s and ~560k depending on false-sharing (which I only minimally
sanitized). So this is at least 10% if you are unlucky.

The change also facilitated some cosmetic fixes.

It has an unintentional side effect of no longer issuing spurious
idr_preload() around idr_replace().

Signed-off-by: default avatarMateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com


Reviewed-by: default avatarOleg Nesterov <oleg@redhat.com>
Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parent c0aac597
Loading
Loading
Loading
Loading
+85 −46
Original line number Diff line number Diff line
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
			free_pid(pids[tmp]);
}

struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		      size_t set_tid_size)
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
		      size_t arg_set_tid_size)
{
	int set_tid[MAX_PID_NS_LEVEL + 1] = {};
	int pid_max[MAX_PID_NS_LEVEL + 1] = {};
	struct pid *pid;
	enum pid_type type;
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
	int retval = -ENOMEM;
	bool retried_preload;

	/*
	 * set_tid_size contains the size of the set_tid array. Starting at
	 * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
	 * the most nested currently active PID namespace it tells alloc_pid()
	 * which PID to set for a process in that most nested PID namespace
	 * up to set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but set_tid_size must
	 * up to arg_set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but arg_set_tid_size must
	 * never be greater than the current ns->level + 1.
	 */
	if (set_tid_size > ns->level + 1)
	if (arg_set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

	/*
	 * Prep before we take locks:
	 *
	 * 1. allocate and fill in pid struct
	 */
	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	tmp = ns;
	get_pid_ns(ns);
	pid->level = ns->level;
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);
	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	for (i = ns->level; i >= 0; i--) {
		int tid = 0;
		int pid_max = READ_ONCE(tmp->pid_max);
	/*
	 * 2. perm check checkpoint_restore_ns_capable()
	 *
	 * This stores found pid_max to make sure the used value is the same should
	 * later code need it.
	 */
	for (tmp = ns, i = ns->level; i >= 0; i--) {
		pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);

		if (set_tid_size) {
			tid = set_tid[ns->level - i];
		if (arg_set_tid_size) {
			int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];

			retval = -EINVAL;
			if (tid < 1 || tid >= pid_max)
				goto out_free;
			if (tid < 1 || tid >= pid_max[ns->level - i])
				goto out_abort;
			/*
			 * Also fail if a PID != 1 is requested and
			 * no PID 1 exists.
			 */
			if (tid != 1 && !tmp->child_reaper)
				goto out_free;
				goto out_abort;
			retval = -EPERM;
			if (!checkpoint_restore_ns_capable(tmp->user_ns))
				goto out_free;
			set_tid_size--;
				goto out_abort;
			arg_set_tid_size--;
		}

		tmp = tmp->parent;
	}

	/*
	 * Prep is done, id allocation goes here:
	 */
	retried_preload = false;
	idr_preload(GFP_KERNEL);
	spin_lock(&pidmap_lock);
	for (tmp = ns, i = ns->level; i >= 0;) {
		int tid = set_tid[ns->level - i];

		if (tid) {
			nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
			 * alreay in use. Return EEXIST in that case.
			 */
			if (nr == -ENOSPC)

				nr = -EEXIST;
		} else {
			int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
			 * a partially initialized PID (see below).
			 */
			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
					      pid_max, GFP_ATOMIC);
					      pid_max[ns->level - i], GFP_ATOMIC);
			if (nr == -ENOSPC)
				nr = -EAGAIN;
		}

		if (unlikely(nr < 0)) {
			/*
			 * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
			 *
			 * The IDR API only allows us to preload memory for one call, while we may end
			 * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
			 * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
			 * did not help (the routine unfortunately returns void, so we have no idea
			 * if it got anywhere).
			 *
			 * The lock can be safely dropped and picked up as historically pid allocation
			 * for different namespaces was *not* atomic -- we try to hold on to it the
			 * entire time only for performance reasons.
			 */
			if (nr == -ENOMEM && !retried_preload) {
				spin_unlock(&pidmap_lock);
				idr_preload_end();

		if (nr < 0) {
			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
				retried_preload = true;
				idr_preload(GFP_KERNEL);
				spin_lock(&pidmap_lock);
				continue;
			}
			retval = nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
		i--;
		retried_preload = false;
	}

	/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
	 * is what we have exposed to userspace for a long time and it is
	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
	 *
	 * This can't be done earlier because we need to preserve other
	 * error conditions.
	 */
	retval = -ENOMEM;

	get_pid_ns(ns);
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	upid = pid->numbers + ns->level;
	idr_preload(GFP_KERNEL);
	spin_lock(&pidmap_lock);
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
		goto out_free;
	pidfs_add_pid(pid);
	for ( ; upid >= pid->numbers; --upid) {
	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,

	return pid;

out_unlock:
	spin_unlock(&pidmap_lock);
	idr_preload_end();
	put_pid_ns(ns);

out_free:
	spin_lock(&pidmap_lock);
	while (++i <= ns->level) {
		upid = pid->numbers + i;
		idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		idr_set_cursor(&ns->idr, 0);

	spin_unlock(&pidmap_lock);
	idr_preload_end();

out_abort:
	put_pid_ns(ns);
	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}