Unverified commit 87caaeef, authored by Mateusz Guzik, committed by Christian Brauner
Browse files

pidfs: implement ino allocation without the pidmap lock



This paves the way for scalable PID allocation later.

The 32 bit variant merely takes a spinlock for simplicity; the 64 bit
variant uses a scalable scheme.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20260120184539.1480930-1-mjguzik@gmail.com


Co-developed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
parent 03aef060
Loading
Loading
Loading
Loading
+72 −41
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@
#include <linux/coredump.h>
#include <linux/rhashtable.h>
#include <linux/xattr.h>
#include <linux/cookie.h>

#include "internal.h"
#include "mount.h"
@@ -65,7 +66,39 @@ static const struct rhashtable_params pidfs_ino_ht_params = {
	.automatic_shrinking	= true,
};

/*
 * inode number handling
 *
 * On 64 bit nothing special happens. The 64bit number assigned
 * to struct pid is the inode number.
 *
 * On 32 bit the 64 bit number assigned to struct pid is split
 * into two 32 bit numbers. The lower 32 bits are used as the
 * inode number and the upper 32 bits are used as the inode
 * generation number.
 *
 * On 32 bit pidfs_ino() will return the lower 32 bits. When
 * pidfs_ino() returns zero a wraparound happened. When a
 * wraparound happens the 64 bit number will be incremented by 1
 * so inode numbering starts at 1 again.
 *
 * On 64 bit comparing two pidfds is as simple as comparing
 * inode numbers.
 *
 * When a wraparound happens on 32 bit, multiple pidfds with the
 * same inode number are likely to exist. This isn't a problem:
 * before pidfs, pidfds used the anonymous inode, meaning all
 * pidfds had the same inode number. Userspace can reconstruct
 * the 64 bit identifier by retrieving both the inode number and
 * the inode generation number and comparing them, or it can use
 * file handles.
 */

#if BITS_PER_LONG == 32

/*
 * 32 bit allocator state: pidfs_ino_nr is the next 64 bit identifier to
 * hand out (numbering starts at 1) and pidfs_ino_lock serializes every
 * access to it. Both are only used by the allocator in this file, so
 * neither needs external linkage.
 */
static DEFINE_SPINLOCK(pidfs_ino_lock);
static u64 pidfs_ino_nr = 1;

static inline unsigned long pidfs_ino(u64 ino)
{
	return lower_32_bits(ino);
@@ -77,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
	return upper_32_bits(ino);
}

/*
 * Hand out the next 64 bit pidfs identifier on 32 bit.
 *
 * The counter is read and advanced under pidfs_ino_lock. A counter value
 * whose lower 32 bits are zero is stepped over first, so the inode number
 * derived via pidfs_ino() from the returned identifier is never zero.
 */
static inline u64 pidfs_alloc_ino(void)
{
	u64 next;

	spin_lock(&pidfs_ino_lock);

	/* Lower half wrapped around to zero: skip that value. */
	if (!pidfs_ino(pidfs_ino_nr))
		pidfs_ino_nr++;

	/* Take the current value and leave the counter one past it. */
	next = pidfs_ino_nr;
	pidfs_ino_nr = next + 1;

	spin_unlock(&pidfs_ino_lock);

	return next;
}

#else

/* On 64 bit simply return ino. */
@@ -90,59 +135,45 @@ static inline u32 pidfs_gen(u64 ino)
{
	return 0;
}
#endif

/*
 * Allocate inode number and initialize pidfs fields.
 * Called with pidmap_lock held.
 */
void pidfs_prepare_pid(struct pid *pid)
DEFINE_COOKIE(pidfs_ino_cookie);

/*
 * Return the next value of pidfs_ino_cookie for use as a pidfs inode
 * number.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers. The
 * function-local pidfs_ino_nr counter, the long comment and the "+= 2"
 * wraparound step below look like leftovers of the previous
 * pidfs_prepare_pid() body rather than part of the cookie based
 * allocator - confirm against the applied tree.
 */
static u64 pidfs_alloc_ino(void)
{
	static u64 pidfs_ino_nr = 2;
	u64 ino;

	/*
	 * On 64 bit nothing special happens. The 64bit number assigned
	 * to struct pid is the inode number.
	 *
	 * On 32 bit the 64 bit number assigned to struct pid is split
	 * into two 32 bit numbers. The lower 32 bits are used as the
	 * inode number and the upper 32 bits are used as the inode
	 * generation number.
	 *
	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
	 * pidfs_ino() returns zero a wrap around happened. When a
	 * wraparound happens the 64 bit number will be incremented by 2
	 * so inode numbering starts at 2 again.
	 *
	 * On 64 bit comparing two pidfds is as simple as comparing
	 * inode numbers.
	 *
	 * When a wraparound happens on 32 bit multiple pidfds with the
	 * same inode number are likely to exist (This isn't a problem
	 * since before pidfs pidfds used the anonymous inode meaning
	 * all pidfds had the same inode number.). Userspace can
	 * reconstruct the 64 bit identifier by retrieving both the
	 * inode number and the inode generation number to compare or
	 * use file handles.
	 */
	if (pidfs_ino(pidfs_ino_nr) == 0)
		pidfs_ino_nr += 2;
	/*
	 * NOTE(review): presumably gen_cookie_next() operates on per-CPU
	 * state and therefore must run with preemption disabled - confirm
	 * in <linux/cookie.h>.
	 */
	preempt_disable();
	ino = gen_cookie_next(&pidfs_ino_cookie);
	preempt_enable();

	/*
	 * ino is unsigned, so this only fires for 0. Elsewhere in this file
	 * pid->ino == 0 marks a pid that was not added to pidfs_ino_ht.
	 */
	VFS_WARN_ON_ONCE(ino < 1);
	return ino;
}

	pid->ino = pidfs_ino_nr;
	pid->pidfs_hash.next = NULL;
#endif

/*
 * Reset the pidfs related fields of @pid: the stashed and attr pointers
 * are cleared and the inode number is set to 0. No inode number is
 * allocated here; pidfs_add_pid() does that.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers. The
 * pidfs_ino_nr++ statement looks like a leftover of the previous
 * implementation - confirm against the applied tree.
 */
void pidfs_prepare_pid(struct pid *pid)
{
	pid->stashed = NULL;
	pid->attr = NULL;
	pidfs_ino_nr++;
	/* 0 means "not in pidfs_ino_ht"; pidfs_remove_pid() checks this. */
	pid->ino = 0;
}

/*
 * Assign an inode number to @pid and insert it into pidfs_ino_ht.
 *
 * Returns the result of rhashtable_insert_fast(). When that result is
 * non-zero, pid->ino is reset to 0 so that pidfs_remove_pid() will not
 * try to remove an entry that was never inserted.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers. The
 * dangling "return rhashtable_insert_fast(" line directly below looks
 * like the removed first line of the old one-statement body - confirm
 * against the applied tree.
 */
int pidfs_add_pid(struct pid *pid)
{
	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
	int ret;

	/* The number is allocated here, not in pidfs_prepare_pid(). */
	pid->ino = pidfs_alloc_ino();
	ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
				     pidfs_ino_ht_params);
	/* Insertion failed: mark the pid as not hashed. */
	if (unlikely(ret))
		pid->ino = 0;
	return ret;
}

/*
 * Remove @pid from pidfs_ino_ht.
 *
 * A pid whose inode number is 0 is left alone: nothing is removed from
 * the hash table for it.
 */
void pidfs_remove_pid(struct pid *pid)
{
	/* Uncommon case: no inode number set, so there is nothing to undo. */
	if (unlikely(!pid->ino))
		return;

	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
			       pidfs_ino_ht_params);
}
+1 −2
Original line number Diff line number Diff line
@@ -198,6 +198,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
		INIT_HLIST_HEAD(&pid->tasks[type]);
	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);
	pidfs_prepare_pid(pid);

	/*
	 * 2. perm check checkpoint_restore_ns_capable()
@@ -314,8 +315,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
	retval = -ENOMEM;
	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
		goto out_free;
	pidfs_prepare_pid(pid);

	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);