Commit 64bef697 authored by Oleg Nesterov's avatar Oleg Nesterov Committed by Christian Brauner
Browse files

pidfd: implement PIDFD_THREAD flag for pidfd_open()



With this flag:

	- pidfd_open() doesn't require that the target task must be
	  a thread-group leader

	- pidfd_poll() succeeds when the task exits and becomes a
	  zombie (iow, passes exit_notify()), even if it is a leader
	  and thread-group is not empty.

	  This means that the behaviour of pidfd_poll(PIDFD_THREAD,
	  pid-of-group-leader) is not well defined if it races with
	  exec() from its sub-thread; pidfd_poll() can succeed or not
	  depending on whether pidfd_task_exited() is called before
	  or after exchange_tids().

	  Perhaps we can improve this behaviour later, pidfd_poll()
	  can probably take sig->group_exec_task into account. But
	  this doesn't really differ from the case when the leader
	  exits before other threads (so pidfd_poll() succeeds) and
	  then another thread execs and pidfd_poll() will block again.

thread_group_exited() is no longer used, perhaps it can die.

Co-developed-by: default avatarTycho Andersen <tycho@tycho.pizza>
Signed-off-by: default avatarOleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com


Tested-by: default avatarTycho Andersen <tandersen@netflix.com>
Reviewed-by: default avatarTycho Andersen <tandersen@netflix.com>
Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parent 21e25205
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)

		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
		leader->exit_state = EXIT_DEAD;

		/*
		 * leader and tsk exhanged their pids, the old pid dies,
		 * wake up the PIDFD_THREAD waiters.
		 */
		do_notify_pidfd(leader);
		/*
		 * We are going to release_task()->ptrace_unlink() silently,
		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
+2 −1
Original line number Diff line number Diff line
@@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;

struct file;

extern struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);

static inline struct pid *get_pid(struct pid *pid)
{
+2 −1
Original line number Diff line number Diff line
@@ -8,5 +8,6 @@

/* Flags for pidfd_open().  */
#define PIDFD_NONBLOCK	O_NONBLOCK
#define PIDFD_THREAD	O_EXCL

#endif /* _UAPI_LINUX_PIDFD_H */
+7 −0
Original line number Diff line number Diff line
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	tsk->exit_state = EXIT_ZOMBIE;
	/*
	 * sub-thread or delay_group_leader(), wake up the
	 * PIDFD_THREAD waiters.
	 */
	if (!thread_group_empty(tsk))
		do_notify_pidfd(tsk);

	if (unlikely(tsk->ptrace)) {
		int sig = thread_group_leader(tsk) &&
				thread_group_empty(tsk) &&
+31 −7
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)

	seq_put_decimal_ll(m, "Pid:\t", nr);

	/* TODO: report PIDFD_THREAD */

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
@@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif

static bool pidfd_task_exited(struct pid *pid, bool thread)
{
	struct task_struct *task;
	bool exited;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	exited = !task ||
		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
	rcu_read_unlock();

	return exited;
}

/*
 * Poll support for process exit notification.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
	struct pid *pid = file->private_data;
	bool thread = file->f_flags & PIDFD_THREAD;
	__poll_t poll_flags = 0;

	poll_wait(file, &pid->wait_pidfd, pts);

	/*
	 * Inform pollers only when the whole thread group exits.
	 * If the thread group leader exits before all other threads in the
	 * group, then poll(2) should block, similar to the wait(2) family.
	 * Depending on PIDFD_THREAD, inform pollers when the thread
	 * or the whole thread-group exits.
	 */
	if (thread_group_exited(pid))
	if (pidfd_task_exited(pid, thread))
		poll_flags = EPOLLIN | EPOLLRDNORM;

	return poll_flags;
@@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
		return PTR_ERR(pidfd_file);
	}
	get_pid(pid); /* held by pidfd_file now */
	/*
	 * anon_inode_getfile() ignores everything outside of the
	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
	 */
	pidfd_file->f_flags |= (flags & PIDFD_THREAD);
	*ret = pidfd_file;
	return pidfd;
}
@@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper verifies that @pid is used as a thread group leader.
 * The helper verifies that @pid is still in use, without PIDFD_THREAD the
 * task identified by @pid must be a thread-group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
	bool thread = flags & PIDFD_THREAD;

	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
		return -EINVAL;

	return __pidfd_prepare(pid, flags, ret);
Loading