pid: only take pidmap_lock once on alloc

When spawning and killing threads in separate processes in parallel the
primary bottleneck on the stock kernel is pidmap_lock, largely because
of a back-to-back acquire in the common case. This aspect is fixed with
the patch.

Performance improvement varies between reboots. When benchmarking with
20 processes creating and killing threads in a loop, the unpatched
baseline hovers around 465k ops/s, while patched is anything between
~510k ops/s and ~560k depending on false-sharing (which I only minimally
sanitized). So this is at least 10% if you are unlucky.

The change also facilitated some cosmetic fixes.

It has an unintentional side effect of no longer issuing spurious
idr_preload() around idr_replace().

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Mateusz Guzik
2025-12-03 10:28:51 +01:00
committed by Christian Brauner
parent c0aac5975b
commit 6d864a1b18

View File

@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
free_pid(pids[tmp]);
}
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
size_t arg_set_tid_size)
{
int set_tid[MAX_PID_NS_LEVEL + 1] = {};
int pid_max[MAX_PID_NS_LEVEL + 1] = {};
struct pid *pid;
enum pid_type type;
int i, nr;
struct pid_namespace *tmp;
struct upid *upid;
int retval = -ENOMEM;
bool retried_preload;
/*
* set_tid_size contains the size of the set_tid array. Starting at
* arg_set_tid_size contains the size of the arg_set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
* up to set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but set_tid_size must
* up to arg_set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but arg_set_tid_size must
* never be greater than the current ns->level + 1.
*/
if (set_tid_size > ns->level + 1)
if (arg_set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
/*
* Prep before we take locks:
*
* 1. allocate and fill in pid struct
*/
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
tmp = ns;
get_pid_ns(ns);
pid->level = ns->level;
refcount_set(&pid->count, 1);
spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
for (i = ns->level; i >= 0; i--) {
int tid = 0;
int pid_max = READ_ONCE(tmp->pid_max);
/*
* 2. perm check checkpoint_restore_ns_capable()
*
* This stores found pid_max to make sure the used value is the same should
* later code need it.
*/
for (tmp = ns, i = ns->level; i >= 0; i--) {
pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
if (arg_set_tid_size) {
int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
retval = -EINVAL;
if (tid < 1 || tid >= pid_max)
goto out_free;
if (tid < 1 || tid >= pid_max[ns->level - i])
goto out_abort;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_free;
goto out_abort;
retval = -EPERM;
if (!checkpoint_restore_ns_capable(tmp->user_ns))
goto out_free;
set_tid_size--;
goto out_abort;
arg_set_tid_size--;
}
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
tmp = tmp->parent;
}
/*
* Prep is done, id allocation goes here:
*/
retried_preload = false;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
for (tmp = ns, i = ns->level; i >= 0;) {
int tid = set_tid[ns->level - i];
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
pid_max[ns->level - i], GFP_ATOMIC);
if (nr == -ENOSPC)
nr = -EAGAIN;
}
spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
if (unlikely(nr < 0)) {
/*
* Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
*
* The IDR API only allows us to preload memory for one call, while we may end
* up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
* salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
* did not help (the routine unfortunately returns void, so we have no idea
* if it got anywhere).
*
* The lock can be safely dropped and picked up as historically pid allocation
* for different namespaces was *not* atomic -- we try to hold on to it the
* entire time only for performance reasons.
*/
if (nr == -ENOMEM && !retried_preload) {
spin_unlock(&pidmap_lock);
idr_preload_end();
retried_preload = true;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
continue;
}
retval = nr;
goto out_free;
}
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
i--;
retried_preload = false;
}
/*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
* is what we have exposed to userspace for a long time and it is
* documented behavior for pid namespaces. So we can't easily
* change it even if there were an error code better suited.
*
* This can't be done earlier because we need to preserve other
* error conditions.
*/
retval = -ENOMEM;
get_pid_ns(ns);
refcount_set(&pid->count, 1);
spin_lock_init(&pid->lock);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
goto out_free;
pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
return pid;
out_unlock:
spin_unlock(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns);
out_free:
spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ out_free:
idr_set_cursor(&ns->idr, 0);
spin_unlock(&pidmap_lock);
idr_preload_end();
out_abort:
put_pid_ns(ns);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
}