Commit 543b9b63 authored by Linus Torvalds
Browse files

Merge tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pidfs updates from Christian Brauner:

 - pid: introduce task_ppid_vnr() helper

 - pidfs: convert rb-tree to rhashtable

   Mateusz reported performance penalties during task creation because
   pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
   rhashtable to get separate fine-grained locking and to decouple pidfs
   from pidmap_lock, moving all heavy manipulations outside of it.

   Also move inode allocation outside of pidmap_lock. With this there's
   nothing happening for pidfs under pidmap_lock

 - pid: reorder fields in pid_namespace to reduce false sharing

 - Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie
   callers"

 - ipc: Add SPDX license id to mqueue.c

* tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  pid: introduce task_ppid_vnr() helper
  pidfs: implement ino allocation without the pidmap lock
  Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"
  pid: reorder fields in pid_namespace to reduce false sharing
  pidfs: convert rb-tree to rhashtable
  ipc: Add SPDX license id to mqueue.c
parents 57d76cec 3673dd3c
Loading
Loading
Loading
Loading
+95 −79
Original line number Diff line number Diff line
@@ -21,7 +21,9 @@
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/coredump.h>
#include <linux/rhashtable.h>
#include <linux/xattr.h>
#include <linux/cookie.h>

#include "internal.h"
#include "mount.h"
@@ -55,9 +57,48 @@ struct pidfs_attr {
	__u32 coredump_signal;
};

static struct rb_root pidfs_ino_tree = RB_ROOT;
static struct rhashtable pidfs_ino_ht;

/*
 * Parameters for pidfs_ino_ht. Entries are struct pid objects: the hash
 * key is the u64 stored in pid->ino and the objects are linked into the
 * table through pid->pidfs_hash.
 */
static const struct rhashtable_params pidfs_ino_ht_params = {
	/* Key location and size inside struct pid. */
	.key_offset		= offsetof(struct pid, ino),
	.key_len		= sizeof(u64),
	/* Location of the embedded hash-list linkage inside struct pid. */
	.head_offset		= offsetof(struct pid, pidfs_hash),
	/* Opt in to the rhashtable's automatic shrinking. */
	.automatic_shrinking	= true,
};

/*
 * inode number handling
 *
 * On 64 bit nothing special happens. The 64bit number assigned
 * to struct pid is the inode number.
 *
 * On 32 bit the 64 bit number assigned to struct pid is split
 * into two 32 bit numbers. The lower 32 bits are used as the
 * inode number and the upper 32 bits are used as the inode
 * generation number.
 *
 * On 32 bit pidfs_ino() will return the lower 32 bits. When
 * pidfs_ino() returns zero a wraparound happened. When a
 * wraparound happens the 64 bit number will be incremented by 1
 * so inode numbering starts at 1 again.
 *
 * On 64 bit comparing two pidfds is as simple as comparing
 * inode numbers.
 *
 * When a wraparound happens on 32 bit multiple pidfds with the
 * same inode number are likely to exist (This isn't a problem
 * since before pidfs pidfds used the anonymous inode meaning
 * all pidfds had the same inode number.). Userspace can
 * reconstruct the 64 bit identifier by retrieving both the
 * inode number and the inode generation number to compare or
 * use file handles.
 */

#if BITS_PER_LONG == 32

/*
 * Held by pidfs_alloc_ino() around every read and update of pidfs_ino_nr.
 * NOTE(review): this lock is not declared static - confirm whether anything
 * outside this file references it; if not, it could be made file-local.
 */
DEFINE_SPINLOCK(pidfs_ino_lock);
/* Next 64 bit identifier to hand out on 32 bit builds; starts at 1. */
static u64 pidfs_ino_nr = 1;

static inline unsigned long pidfs_ino(u64 ino)
{
	return lower_32_bits(ino);
@@ -69,6 +110,18 @@ static inline u32 pidfs_gen(u64 ino)
	return upper_32_bits(ino);
}

/*
 * Hand out the next 64 bit pidfs identifier (32 bit variant).
 *
 * The counter is advanced under pidfs_ino_lock. A counter value whose
 * lower 32 bits are zero is skipped first, so the value returned never
 * maps to inode number 0 via pidfs_ino().
 *
 * Return: the identifier to store in pid->ino.
 */
static inline u64 pidfs_alloc_ino(void)
{
	u64 next;

	spin_lock(&pidfs_ino_lock);

	/* Low half wrapped around to zero: step over it. */
	if (!pidfs_ino(pidfs_ino_nr))
		pidfs_ino_nr++;

	next = pidfs_ino_nr;
	pidfs_ino_nr = next + 1;

	spin_unlock(&pidfs_ino_lock);

	return next;
}

#else

/* On 64 bit simply return ino. */
@@ -82,69 +135,47 @@ static inline u32 pidfs_gen(u64 ino)
{
	return 0;
}
#endif

/*
 * Three-way comparison of two rb-tree nodes embedded in struct pid,
 * ordered by pid->ino. Passed to rb_find_add_rcu() as the ordering
 * callback.
 *
 * Return: -1 if @a sorts before @b, 1 if it sorts after, 0 if equal.
 */
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
{
	const u64 lhs = rb_entry(a, struct pid, pidfs_node)->ino;
	const u64 rhs = rb_entry(b, struct pid, pidfs_node)->ino;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
DEFINE_COOKIE(pidfs_ino_cookie);

void pidfs_add_pid(struct pid *pid)
static u64 pidfs_alloc_ino(void)
{
	static u64 pidfs_ino_nr = 2;
	u64 ino;

	/*
	 * On 64 bit nothing special happens. The 64bit number assigned
	 * to struct pid is the inode number.
	 *
	 * On 32 bit the 64 bit number assigned to struct pid is split
	 * into two 32 bit numbers. The lower 32 bits are used as the
	 * inode number and the upper 32 bits are used as the inode
	 * generation number.
	 *
	 * On 32 bit pidfs_ino() will return the lower 32 bit. When
	 * pidfs_ino() returns zero a wrap around happened. When a
	 * wraparound happens the 64 bit number will be incremented by 2
	 * so inode numbering starts at 2 again.
	 *
	 * On 64 bit comparing two pidfds is as simple as comparing
	 * inode numbers.
	 *
	 * When a wraparound happens on 32 bit multiple pidfds with the
	 * same inode number are likely to exist (This isn't a problem
	 * since before pidfs pidfds used the anonymous inode meaning
	 * all pidfds had the same inode number.). Userspace can
	 * reconstruct the 64 bit identifier by retrieving both the
	 * inode number and the inode generation number to compare or
	 * use file handles.
	 */
	if (pidfs_ino(pidfs_ino_nr) == 0)
		pidfs_ino_nr += 2;
	preempt_disable();
	ino = gen_cookie_next(&pidfs_ino_cookie);
	preempt_enable();

	VFS_WARN_ON_ONCE(ino < 1);
	return ino;
}

#endif

	pid->ino = pidfs_ino_nr;
/*
 * Reset the pidfs-related fields of @pid: clears ->stashed and ->attr and
 * sets ->ino to 0. pidfs_remove_pid() treats an ->ino of 0 as "nothing to
 * remove from the hashtable".
 */
void pidfs_prepare_pid(struct pid *pid)
{
	pid->stashed = NULL;
	pid->attr = NULL;
	/*
	 * NOTE(review): this counter bump looks like a remnant of the older
	 * allocation scheme where the inode number was assigned here -
	 * confirm it is intended, since pidfs_alloc_ino() advances the
	 * counter itself.
	 */
	pidfs_ino_nr++;
	pid->ino = 0;
}

/*
 * Assign an inode number to @pid and publish it for lookup by inode number.
 *
 * Return: 0 on success, otherwise the error from rhashtable_insert_fast();
 * on failure pid->ino is reset to 0.
 */
int pidfs_add_pid(struct pid *pid)
{
	int ret;

	/*
	 * NOTE(review): this rb-tree insertion runs before pid->ino is
	 * assigned below and duplicates the hashtable insertion; it looks
	 * like a leftover of the rb-tree based implementation - confirm
	 * whether it should still be here.
	 */
	write_seqcount_begin(&pidmap_lock_seq);
	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
	write_seqcount_end(&pidmap_lock_seq);
	pid->ino = pidfs_alloc_ino();
	ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
				     pidfs_ino_ht_params);
	/* Leave ->ino at 0 so pidfs_remove_pid() skips the hashtable removal. */
	if (unlikely(ret))
		pid->ino = 0;
	return ret;
}

/*
 * Undo pidfs_add_pid(): take @pid out of the inode-number lookup
 * structures.
 */
void pidfs_remove_pid(struct pid *pid)
{
	/*
	 * NOTE(review): the rb-tree erase is unconditional while the
	 * hashtable removal below is guarded by ->ino; this looks like a
	 * leftover of the rb-tree based implementation - confirm.
	 */
	write_seqcount_begin(&pidmap_lock_seq);
	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
	write_seqcount_end(&pidmap_lock_seq);
	/* ->ino is 0 when pidfs_add_pid() failed to insert into the table. */
	if (likely(pid->ino))
		rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
				       pidfs_ino_ht_params);
}

void pidfs_free_pid(struct pid *pid)
@@ -415,7 +446,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
	 * the fields are set correctly, or return ESRCH to avoid providing
	 * incomplete information. */

	kinfo.ppid = task_ppid_nr_ns(task, NULL);
	kinfo.ppid = task_ppid_vnr(task);
	kinfo.tgid = task_tgid_vnr(task);
	kinfo.pid = task_pid_vnr(task);
	kinfo.mask |= PIDFD_INFO_PID;
@@ -791,42 +822,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
	return FILEID_KERNFS;
}

/*
 * rb-tree lookup callback: compare the u64 inode number that @key points
 * to against the ->ino of the struct pid that embeds @node.
 *
 * Return: -1 if the key is smaller, 1 if it is larger, 0 on a match.
 */
static int pidfs_ino_find(const void *key, const struct rb_node *node)
{
	const u64 wanted = *(const u64 *)key;
	const u64 found = rb_entry(node, struct pid, pidfs_node)->ino;

	if (wanted == found)
		return 0;

	return wanted < found ? -1 : 1;
}

/*
 * Find a struct pid based on the inode number.
 *
 * The lookup goes through pidfs_ino_ht only, which is where pidfs_add_pid()
 * publishes every pid. The lookup and all checks run inside guard(rcu).
 *
 * Return: @ino's struct pid with an extra reference taken via get_pid(),
 * or NULL when
 *   - no entry with that inode number is in the table,
 *   - the pid has no usable pidfs attributes (->attr is NULL or an error
 *     pointer),
 *   - PIDFS_ATTR_BIT_EXIT is set in the attributes, or
 *   - pid_vnr() reports 0 for the pid.
 */
static struct pid *pidfs_ino_get_pid(u64 ino)
{
	struct pid *pid;
	struct pidfs_attr *attr;

	guard(rcu)();

	pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
	if (!pid)
		return NULL;

	/* Read ->attr once so the checks below see one consistent value. */
	attr = READ_ONCE(pid->attr);
	if (IS_ERR_OR_NULL(attr))
		return NULL;
	if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
		return NULL;

	/* Within our pid namespace hierarchy? */
	if (pid_vnr(pid) == 0)
		return NULL;

	return get_pid(pid);
}

@@ -1104,6 +1117,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)

void __init pidfs_init(void)
{
	if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
		panic("Failed to initialize pidfs hashtable");

	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
+7 −2
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/rhashtable-types.h>
#include <linux/sched.h>
#include <linux/wait.h>

@@ -60,7 +61,7 @@ struct pid {
	spinlock_t lock;
	struct {
		u64 ino;
		struct rb_node pidfs_node;
		struct rhash_head pidfs_hash;
		struct dentry *stashed;
		struct pidfs_attr *attr;
	};
@@ -73,7 +74,6 @@ struct pid {
	struct upid numbers[];
};

extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;

struct file;
@@ -310,6 +310,11 @@ static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_na
	return pid;
}

/*
 * task_ppid_vnr - parent pid of @tsk, looked up with a NULL namespace.
 *
 * Shorthand for task_ppid_nr_ns(@tsk, NULL).
 * NOTE(review): presumably a NULL namespace selects the caller's own pid
 * namespace, as with the other *_vnr helpers - confirm against
 * task_ppid_nr_ns().
 */
static inline pid_t task_ppid_vnr(const struct task_struct *tsk)
{
	pid_t ppid = task_ppid_nr_ns(tsk, NULL);

	return ppid;
}

static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
	return task_ppid_nr_ns(tsk, &init_pid_ns);
+7 −7
Original line number Diff line number Diff line
@@ -27,6 +27,13 @@ struct pid_namespace {
	struct idr idr;
	struct rcu_head rcu;
	unsigned int pid_allocated;
#ifdef CONFIG_SYSCTL
#if defined(CONFIG_MEMFD_CREATE)
	int memfd_noexec_scope;
#endif
	struct ctl_table_set	set;
	struct ctl_table_header *sysctls;
#endif
	struct task_struct *child_reaper;
	struct kmem_cache *pid_cachep;
	unsigned int level;
@@ -40,13 +47,6 @@ struct pid_namespace {
	int reboot;	/* group exit code if this pidns was rebooted */
	struct ns_common ns;
	struct work_struct	work;
#ifdef CONFIG_SYSCTL
	struct ctl_table_set	set;
	struct ctl_table_header *sysctls;
#if defined(CONFIG_MEMFD_CREATE)
	int memfd_noexec_scope;
#endif
#endif
} __randomize_layout;

extern struct pid_namespace init_pid_ns;
+2 −1
Original line number Diff line number Diff line
@@ -6,7 +6,8 @@ struct coredump_params;

struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
void pidfs_add_pid(struct pid *pid);
void pidfs_prepare_pid(struct pid *pid);
int pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
void pidfs_exit(struct task_struct *tsk);
#ifdef CONFIG_COREDUMP
+1 −2
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * POSIX message queues filesystem for Linux.
 *
@@ -9,8 +10,6 @@
 *			    Manfred Spraul	    (manfred@colorfullife.com)
 *
 * Audit:                   George Wilson           (ltcgcw@us.ibm.com)
 *
 * This file is released under the GPL.
 */

#include <linux/capability.h>
Loading