pid: only take pidmap_lock once on alloc (6d864a1b) · Commits · git / linux-net

kernel/pid.c

+85 −46

Original line number	Diff line number	Diff line
		@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
		free_pid(pids[tmp]);
		}

		struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,
		size_t set_tid_size)
		struct pid alloc_pid(struct pid_namespace ns, pid_t *arg_set_tid,
		size_t arg_set_tid_size)
		{
		int set_tid[MAX_PID_NS_LEVEL + 1] = {};
		int pid_max[MAX_PID_NS_LEVEL + 1] = {};
		struct pid *pid;
		enum pid_type type;
		int i, nr;
		struct pid_namespace *tmp;
		struct upid *upid;
		int retval = -ENOMEM;
		bool retried_preload;

		/*
		* set_tid_size contains the size of the set_tid array. Starting at
		* arg_set_tid_size contains the size of the arg_set_tid array. Starting at
		* the most nested currently active PID namespace it tells alloc_pid()
		* which PID to set for a process in that most nested PID namespace
		* up to set_tid_size PID namespaces. It does not have to set the PID
		* for a process in all nested PID namespaces but set_tid_size must
		* up to arg_set_tid_size PID namespaces. It does not have to set the PID
		* for a process in all nested PID namespaces but arg_set_tid_size must
		* never be greater than the current ns->level + 1.
		*/
		if (set_tid_size > ns->level + 1)
		if (arg_set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

		/*
		* Prep before we take locks:
		*
		* 1. allocate and fill in pid struct
		*/
		pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
		if (!pid)
		return ERR_PTR(retval);

		tmp = ns;
		get_pid_ns(ns);
		pid->level = ns->level;
		refcount_set(&pid->count, 1);
		spin_lock_init(&pid->lock);
		for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);
		init_waitqueue_head(&pid->wait_pidfd);
		INIT_HLIST_HEAD(&pid->inodes);

		for (i = ns->level; i >= 0; i--) {
		int tid = 0;
		int pid_max = READ_ONCE(tmp->pid_max);
		/*
		* 2. perm check checkpoint_restore_ns_capable()
		*
		* This stores found pid_max to make sure the used value is the same should
		* later code need it.
		*/
		for (tmp = ns, i = ns->level; i >= 0; i--) {
		pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);

		if (set_tid_size) {
		tid = set_tid[ns->level - i];
		if (arg_set_tid_size) {
		int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];

		retval = -EINVAL;
		if (tid < 1 \|\| tid >= pid_max)
		goto out_free;
		if (tid < 1 \|\| tid >= pid_max[ns->level - i])
		goto out_abort;
		/*
		* Also fail if a PID != 1 is requested and
		* no PID 1 exists.
		*/
		if (tid != 1 && !tmp->child_reaper)
		goto out_free;
		goto out_abort;
		retval = -EPERM;
		if (!checkpoint_restore_ns_capable(tmp->user_ns))
		goto out_free;
		set_tid_size--;
		goto out_abort;
		arg_set_tid_size--;
		}

		tmp = tmp->parent;
		}

		/*
		* Prep is done, id allocation goes here:
		*/
		retried_preload = false;
		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);
		for (tmp = ns, i = ns->level; i >= 0;) {
		int tid = set_tid[ns->level - i];

		if (tid) {
		nr = idr_alloc(&tmp->idr, NULL, tid,
		@@ -220,6 +248,7 @@ struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,
		* alreay in use. Return EEXIST in that case.
		*/
		if (nr == -ENOSPC)

		nr = -EEXIST;
		} else {
		int pid_min = 1;
		@@ -235,19 +264,42 @@ struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,
		* a partially initialized PID (see below).
		*/
		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
		pid_max, GFP_ATOMIC);
		pid_max[ns->level - i], GFP_ATOMIC);
		if (nr == -ENOSPC)
		nr = -EAGAIN;
		}

		if (unlikely(nr < 0)) {
		/*
		* Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM.
		*
		* The IDR API only allows us to preload memory for one call, while we may end
		* up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
		* salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
		* did not help (the routine unfortunately returns void, so we have no idea
		* if it got anywhere).
		*
		* The lock can be safely dropped and picked up as historically pid allocation
		* for different namespaces was not atomic -- we try to hold on to it the
		* entire time only for performance reasons.
		*/
		if (nr == -ENOMEM && !retried_preload) {
		spin_unlock(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
		retval = (nr == -ENOSPC) ? -EAGAIN : nr;
		retried_preload = true;
		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);
		continue;
		}
		retval = nr;
		goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
		i--;
		retried_preload = false;
		}

		/*
		@@ -257,25 +309,15 @@ struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,
		* is what we have exposed to userspace for a long time and it is
		* documented behavior for pid namespaces. So we can't easily
		* change it even if there were an error code better suited.
		*
		* This can't be done earlier because we need to preserve other
		* error conditions.
		*/
		retval = -ENOMEM;

		get_pid_ns(ns);
		refcount_set(&pid->count, 1);
		spin_lock_init(&pid->lock);
		for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

		init_waitqueue_head(&pid->wait_pidfd);
		INIT_HLIST_HEAD(&pid->inodes);

		upid = pid->numbers + ns->level;
		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);
		if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
		if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
		goto out_free;
		pidfs_add_pid(pid);
		for ( ; upid >= pid->numbers; --upid) {
		for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
		@@ -286,13 +328,7 @@ struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,

		return pid;

		out_unlock:
		spin_unlock(&pidmap_lock);
		idr_preload_end();
		put_pid_ns(ns);

		out_free:
		spin_lock(&pidmap_lock);
		while (++i <= ns->level) {
		upid = pid->numbers + i;
		idr_remove(&upid->ns->idr, upid->nr);
		@@ -303,7 +339,10 @@ struct pid alloc_pid(struct pid_namespace ns, pid_t *set_tid,
		idr_set_cursor(&ns->idr, 0);

		spin_unlock(&pidmap_lock);
		idr_preload_end();

		out_abort:
		put_pid_ns(ns);
		kmem_cache_free(ns->pid_cachep, pid);
		return ERR_PTR(retval);
		}