Unverified Commit 923ea4d4 authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid"

Christian Brauner <brauner@kernel.org> says:

SO_PEERPIDFD currently doesn't support handing out pidfds if the
sk->sk_peer_pid thread-group leader has already been reaped. In this
case it currently returns EINVAL. Userspace still wants to get a pidfd
for a reaped process to have a stable handle it can pass on.
This is especially useful now that it is possible to retrieve exit
information through a pidfd via the PIDFD_GET_INFO ioctl()'s
PIDFD_INFO_EXIT flag.

Another summary has been provided by David in [1]:

> A pidfd can outlive the task it refers to, and thus user-space must
> already be prepared that the task underlying a pidfd is gone at the time
> they get their hands on the pidfd. For instance, resolving the pidfd to
> a PID via the fdinfo must be prepared to read `-1`.
>
> Despite user-space knowing that a pidfd might be stale, several kernel
> APIs currently add another layer that checks for this. In particular,
> SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped,
> but returns a stale pidfd if the task is reaped immediately after the
> respective alive-check.
>
> This has the unfortunate effect that user-space now has two ways to
> check for the exact same scenario: A syscall might return
> EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no
> particular reason to distinguish both cases. This also propagates
> through user-space APIs, which pass on pidfds. They must be prepared to
> pass on `-1` *or* the pidfd, because there is no guaranteed way to get a
> stale pidfd from the kernel.
> Userspace must already deal with a pidfd referring to a reaped task as
> the task may exit and get reaped at any time will there are still many
> pidfds referring to it.

In order to allow handing out reaped pidfd SO_PEERPIDFD needs to ensure
that PIDFD_INFO_EXIT information is available whenever a pidfd for a
reaped task is created by PIDFD_INFO_EXIT. The uapi promises that reaped
pidfds are only handed out if it is guaranteed that the caller sees the
exit information:

TEST_F(pidfd_info, success_reaped)
{
        struct pidfd_info info = {
                .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT,
        };

        /*
         * Process has already been reaped and PIDFD_INFO_EXIT been set.
         * Verify that we can retrieve the exit status of the process.
         */
        ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0);
        ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS));
        ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT));
        ASSERT_TRUE(WIFEXITED(info.exit_code));
        ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
}

To hand out pidfds for reaped processes we thus allocate a pidfs entry
for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is
stashed and drop it when the socket is destroyed. This guarantees that
exit information will always be recorded for the sk->sk_peer_pid task
and we can hand out pidfds for reaped processes.

* patches from https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org:
  net, pidfs: enable handing out pidfds for reaped sk->sk_peer_pid
  pidfs: get rid of __pidfd_prepare()
  net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid
  pidfs: register pid in pidfs

Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-0-450a19461e75@kernel.org


Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents b590c928 20b70e58
Loading
Loading
Loading
Loading
+72 −9
Original line number Diff line number Diff line
@@ -768,7 +768,7 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
{
	enum pid_type type;

	if (flags & PIDFD_CLONE)
	if (flags & PIDFD_STALE)
		return true;

	/*
@@ -777,10 +777,14 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
	 * pidfd has been allocated perform another check that the pid
	 * is still alive. If it is exit information is available even
	 * if the task gets reaped before the pidfd is returned to
	 * userspace. The only exception is PIDFD_CLONE where no task
	 * linkage has been established for @pid yet and the kernel is
	 * in the middle of process creation so there's nothing for
	 * pidfs to miss.
	 * userspace. The only exception are indicated by PIDFD_STALE:
	 *
	 * (1) The kernel is in the middle of task creation and thus no
	 *     task linkage has been established yet.
	 * (2) The caller knows @pid has been registered in pidfs at a
	 *     time when the task was still alive.
	 *
	 * In both cases exit information will have been reported.
	 */
	if (flags & PIDFD_THREAD)
		type = PIDTYPE_PID;
@@ -874,11 +878,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
	int ret;

	/*
	 * Ensure that PIDFD_CLONE can be passed as a flag without
	 * Ensure that PIDFD_STALE can be passed as a flag without
	 * overloading other uapi pidfd flags.
	 */
	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_THREAD);
	BUILD_BUG_ON(PIDFD_CLONE == PIDFD_NONBLOCK);
	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);

	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
	if (ret < 0)
@@ -887,7 +891,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
	if (!pidfs_pid_valid(pid, &path, flags))
		return ERR_PTR(-ESRCH);

	flags &= ~PIDFD_CLONE;
	flags &= ~PIDFD_STALE;
	pidfd_file = dentry_open(&path, flags, current_cred());
	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
	if (!IS_ERR(pidfd_file))
@@ -896,6 +900,65 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
	return pidfd_file;
}

/**
 * pidfs_register_pid - register a struct pid in pidfs
 * @pid: pid to pin
 *
 * Register a struct pid in pidfs. Needs to be paired with
 * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
 *
 * Return: On success zero, on error a negative error code is returned.
 */
int pidfs_register_pid(struct pid *pid)
{
	struct path path __free(path_put) = {};
	int ret;

	might_sleep();

	if (!pid)
		return 0;

	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
	if (unlikely(ret))
		return ret;
	/* Keep the dentry and only put the reference to the mount. */
	path.dentry = NULL;
	return 0;
}

/**
 * pidfs_get_pid - pin a struct pid through pidfs
 * @pid: pid to pin
 *
 * Similar to pidfs_register_pid() but only valid if the caller knows
 * there's a reference to the @pid through a dentry already that can't
 * go away.
 */
void pidfs_get_pid(struct pid *pid)
{
	if (!pid)
		return;
	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
}

/**
 * pidfs_put_pid - drop a pidfs reference
 * @pid: pid to drop
 *
 * Drop a reference to @pid via pidfs. This is only safe if the
 * reference has been taken via pidfs_get_pid().
 */
void pidfs_put_pid(struct pid *pid)
{
	might_sleep();

	if (!pid)
		return;
	VFS_WARN_ON_ONCE(!pid->stashed);
	dput(pid->stashed);
}

static void pidfs_inode_init_once(void *data)
{
	struct pidfs_inode *pi = data;
+1 −1
Original line number Diff line number Diff line
@@ -77,7 +77,7 @@ struct file;
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file);
void do_notify_pidfd(struct task_struct *task);

static inline struct pid *get_pid(struct pid *pid)
+3 −0
Original line number Diff line number Diff line
@@ -8,5 +8,8 @@ void pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
void pidfs_exit(struct task_struct *tsk);
extern const struct dentry_operations pidfs_dentry_operations;
int pidfs_register_pid(struct pid *pid);
void pidfs_get_pid(struct pid *pid);
void pidfs_put_pid(struct pid *pid);

#endif /* _LINUX_PID_FS_H */
+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@
#define PIDFD_THREAD	O_EXCL
#ifdef __KERNEL__
#include <linux/sched.h>
#define PIDFD_CLONE CLONE_PIDFD
#define PIDFD_STALE CLONE_PIDFD
#endif

/* Flags for pidfd_send_signal(). */
+29 −54
Original line number Diff line number Diff line
@@ -2035,55 +2035,11 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/**
 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the file for the pidfd.
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper doesn't perform checks on @pid which makes it useful for pidfds
 * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
 * pidfd file are prepared.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
	struct file *pidfd_file;

	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
	if (pidfd < 0)
		return pidfd;

	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
	if (IS_ERR(pidfd_file))
		return PTR_ERR(pidfd_file);

	*ret = pidfd_file;
	return take_fd(pidfd);
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the pidfd.
 * @ret_file: return the new pidfs file
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
@@ -2106,16 +2062,26 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
	struct file *pidfs_file;

	/*
	 * PIDFD_STALE is only allowed to be passed if the caller knows
	 * that @pid is already registered in pidfs and thus
	 * PIDFD_INFO_EXIT information is guaranteed to be available.
	 */
	if (!(flags & PIDFD_STALE)) {
		/*
	 * While holding the pidfd waitqueue lock removing the task
	 * linkage for the thread-group leader pid (PIDTYPE_TGID) isn't
	 * possible. Thus, if there's still task linkage for PIDTYPE_PID
	 * not having thread-group leader linkage for the pid means it
	 * wasn't a thread-group leader in the first place.
		 * While holding the pidfd waitqueue lock removing the
		 * task linkage for the thread-group leader pid
		 * (PIDTYPE_TGID) isn't possible. Thus, if there's still
		 * task linkage for PIDTYPE_PID not having thread-group
		 * leader linkage for the pid means it wasn't a
		 * thread-group leader in the first place.
		 */
	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
		guard(spinlock_irq)(&pid->wait_pidfd.lock);

		/* Task has already been reaped. */
		if (!pid_has_task(pid, PIDTYPE_PID))
			return -ESRCH;
@@ -2128,7 +2094,16 @@ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
			return -ENOENT;
	}

	return __pidfd_prepare(pid, flags, ret);
	CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
	if (pidfd < 0)
		return pidfd;

	pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
	if (IS_ERR(pidfs_file))
		return PTR_ERR(pidfs_file);

	*ret_file = pidfs_file;
	return take_fd(pidfd);
}

static void __delayed_free_task(struct rcu_head *rhp)
@@ -2477,7 +2452,7 @@ __latent_entropy struct task_struct *copy_process(
		 * Note that no task has been attached to @pid yet indicate
		 * that via CLONE_PIDFD.
		 */
		retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
		retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
		if (retval < 0)
			goto bad_fork_free_pid;
		pidfd = retval;
Loading