mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-05 00:07:48 -04:00
pid: only take pidmap_lock once on alloc
When spawning and killing threads in separate processes in parallel, the primary bottleneck on the stock kernel is pidmap_lock, largely because of a back-to-back acquire in the common case. This patch fixes that aspect. Performance improvement varies between reboots. When benchmarking with 20 processes creating and killing threads in a loop, the unpatched baseline hovers around 465k ops/s, while the patched kernel is anywhere between ~510k ops/s and ~560k ops/s depending on false-sharing (which I only minimally sanitized). So this is at least a 10% improvement if you are unlucky. The change also facilitated some cosmetic fixes. It has an unintentional side effect of no longer issuing spurious idr_preload() around idr_replace(). Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com Reviewed-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
committed by
Christian Brauner
parent
c0aac5975b
commit
6d864a1b18
131
kernel/pid.c
131
kernel/pid.c
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
|
||||
free_pid(pids[tmp]);
|
||||
}
|
||||
|
||||
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
size_t set_tid_size)
|
||||
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
|
||||
size_t arg_set_tid_size)
|
||||
{
|
||||
int set_tid[MAX_PID_NS_LEVEL + 1] = {};
|
||||
int pid_max[MAX_PID_NS_LEVEL + 1] = {};
|
||||
struct pid *pid;
|
||||
enum pid_type type;
|
||||
int i, nr;
|
||||
struct pid_namespace *tmp;
|
||||
struct upid *upid;
|
||||
int retval = -ENOMEM;
|
||||
bool retried_preload;
|
||||
|
||||
/*
|
||||
* set_tid_size contains the size of the set_tid array. Starting at
|
||||
* arg_set_tid_size contains the size of the arg_set_tid array. Starting at
|
||||
* the most nested currently active PID namespace it tells alloc_pid()
|
||||
* which PID to set for a process in that most nested PID namespace
|
||||
* up to set_tid_size PID namespaces. It does not have to set the PID
|
||||
* for a process in all nested PID namespaces but set_tid_size must
|
||||
* up to arg_set_tid_size PID namespaces. It does not have to set the PID
|
||||
* for a process in all nested PID namespaces but arg_set_tid_size must
|
||||
* never be greater than the current ns->level + 1.
|
||||
*/
|
||||
if (set_tid_size > ns->level + 1)
|
||||
if (arg_set_tid_size > ns->level + 1)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
/*
|
||||
* Prep before we take locks:
|
||||
*
|
||||
* 1. allocate and fill in pid struct
|
||||
*/
|
||||
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
|
||||
if (!pid)
|
||||
return ERR_PTR(retval);
|
||||
|
||||
tmp = ns;
|
||||
get_pid_ns(ns);
|
||||
pid->level = ns->level;
|
||||
refcount_set(&pid->count, 1);
|
||||
spin_lock_init(&pid->lock);
|
||||
for (type = 0; type < PIDTYPE_MAX; ++type)
|
||||
INIT_HLIST_HEAD(&pid->tasks[type]);
|
||||
init_waitqueue_head(&pid->wait_pidfd);
|
||||
INIT_HLIST_HEAD(&pid->inodes);
|
||||
|
||||
for (i = ns->level; i >= 0; i--) {
|
||||
int tid = 0;
|
||||
int pid_max = READ_ONCE(tmp->pid_max);
|
||||
/*
|
||||
* 2. perm check checkpoint_restore_ns_capable()
|
||||
*
|
||||
* This stores found pid_max to make sure the used value is the same should
|
||||
* later code need it.
|
||||
*/
|
||||
for (tmp = ns, i = ns->level; i >= 0; i--) {
|
||||
pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
|
||||
|
||||
if (set_tid_size) {
|
||||
tid = set_tid[ns->level - i];
|
||||
if (arg_set_tid_size) {
|
||||
int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
|
||||
|
||||
retval = -EINVAL;
|
||||
if (tid < 1 || tid >= pid_max)
|
||||
goto out_free;
|
||||
if (tid < 1 || tid >= pid_max[ns->level - i])
|
||||
goto out_abort;
|
||||
/*
|
||||
* Also fail if a PID != 1 is requested and
|
||||
* no PID 1 exists.
|
||||
*/
|
||||
if (tid != 1 && !tmp->child_reaper)
|
||||
goto out_free;
|
||||
goto out_abort;
|
||||
retval = -EPERM;
|
||||
if (!checkpoint_restore_ns_capable(tmp->user_ns))
|
||||
goto out_free;
|
||||
set_tid_size--;
|
||||
goto out_abort;
|
||||
arg_set_tid_size--;
|
||||
}
|
||||
|
||||
idr_preload(GFP_KERNEL);
|
||||
spin_lock(&pidmap_lock);
|
||||
tmp = tmp->parent;
|
||||
}
|
||||
|
||||
/*
|
||||
* Prep is done, id allocation goes here:
|
||||
*/
|
||||
retried_preload = false;
|
||||
idr_preload(GFP_KERNEL);
|
||||
spin_lock(&pidmap_lock);
|
||||
for (tmp = ns, i = ns->level; i >= 0;) {
|
||||
int tid = set_tid[ns->level - i];
|
||||
|
||||
if (tid) {
|
||||
nr = idr_alloc(&tmp->idr, NULL, tid,
|
||||
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
* already in use. Return EEXIST in that case.
|
||||
*/
|
||||
if (nr == -ENOSPC)
|
||||
|
||||
nr = -EEXIST;
|
||||
} else {
|
||||
int pid_min = 1;
|
||||
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
* a partially initialized PID (see below).
|
||||
*/
|
||||
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
||||
pid_max, GFP_ATOMIC);
|
||||
pid_max[ns->level - i], GFP_ATOMIC);
|
||||
if (nr == -ENOSPC)
|
||||
nr = -EAGAIN;
|
||||
}
|
||||
spin_unlock(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
|
||||
if (nr < 0) {
|
||||
retval = (nr == -ENOSPC) ? -EAGAIN : nr;
|
||||
if (unlikely(nr < 0)) {
|
||||
/*
|
||||
* Preload more memory if idr_alloc{,_cyclic} failed with -ENOMEM.
|
||||
*
|
||||
* The IDR API only allows us to preload memory for one call, while we may end
|
||||
* up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
|
||||
* salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
|
||||
* did not help (the routine unfortunately returns void, so we have no idea
|
||||
* if it got anywhere).
|
||||
*
|
||||
* The lock can be safely dropped and picked up as historically pid allocation
|
||||
* for different namespaces was *not* atomic -- we try to hold on to it the
|
||||
* entire time only for performance reasons.
|
||||
*/
|
||||
if (nr == -ENOMEM && !retried_preload) {
|
||||
spin_unlock(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
retried_preload = true;
|
||||
idr_preload(GFP_KERNEL);
|
||||
spin_lock(&pidmap_lock);
|
||||
continue;
|
||||
}
|
||||
retval = nr;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
pid->numbers[i].nr = nr;
|
||||
pid->numbers[i].ns = tmp;
|
||||
tmp = tmp->parent;
|
||||
i--;
|
||||
retried_preload = false;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
* is what we have exposed to userspace for a long time and it is
|
||||
* documented behavior for pid namespaces. So we can't easily
|
||||
* change it even if there were an error code better suited.
|
||||
*
|
||||
* This can't be done earlier because we need to preserve other
|
||||
* error conditions.
|
||||
*/
|
||||
retval = -ENOMEM;
|
||||
|
||||
get_pid_ns(ns);
|
||||
refcount_set(&pid->count, 1);
|
||||
spin_lock_init(&pid->lock);
|
||||
for (type = 0; type < PIDTYPE_MAX; ++type)
|
||||
INIT_HLIST_HEAD(&pid->tasks[type]);
|
||||
|
||||
init_waitqueue_head(&pid->wait_pidfd);
|
||||
INIT_HLIST_HEAD(&pid->inodes);
|
||||
|
||||
upid = pid->numbers + ns->level;
|
||||
idr_preload(GFP_KERNEL);
|
||||
spin_lock(&pidmap_lock);
|
||||
if (!(ns->pid_allocated & PIDNS_ADDING))
|
||||
goto out_unlock;
|
||||
if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
|
||||
goto out_free;
|
||||
pidfs_add_pid(pid);
|
||||
for ( ; upid >= pid->numbers; --upid) {
|
||||
for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
|
||||
/* Make the PID visible to find_pid_ns. */
|
||||
idr_replace(&upid->ns->idr, pid, upid->nr);
|
||||
upid->ns->pid_allocated++;
|
||||
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
|
||||
return pid;
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
put_pid_ns(ns);
|
||||
|
||||
out_free:
|
||||
spin_lock(&pidmap_lock);
|
||||
while (++i <= ns->level) {
|
||||
upid = pid->numbers + i;
|
||||
idr_remove(&upid->ns->idr, upid->nr);
|
||||
@@ -303,7 +339,10 @@ out_free:
|
||||
idr_set_cursor(&ns->idr, 0);
|
||||
|
||||
spin_unlock(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
|
||||
out_abort:
|
||||
put_pid_ns(ns);
|
||||
kmem_cache_free(ns->pid_cachep, pid);
|
||||
return ERR_PTR(retval);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user