Unverified Commit 68b95373 authored by Christian Brauner
Browse files

Merge patch series "pid_namespace: make init creation more flexible"

Pavel Tikhomirov <ptikhomirov@virtuozzo.com> says:

The first patch properly annotates accesses to ->child_reaper with
_ONCE macros, to protect unlocked accesses from possible CPU/compiler
optimization problems.

The second patch makes sure that init is always the first process in
the pid namespace; previously this was only checked in the set_tid case.

The third patch allows joining a pid namespace before its init is
created, which makes it possible to create a pid namespace from one
process and then create the pid namespace init from another process
after setns(). Please see the detailed description in the patch commit
message. It depends on the second patch.

The fourth and final patch is a comprehensive test that covers both the
basic use case of creating a pid namespace and its init separately, and
a more specific use case which shows how clone3(set_tid) usability is
improved by this change.

This change is generally useful as it makes clone3(set_tid) more
universal and lets it work evenly in all cases. It is also highly
useful for CRIU to handle nested containers.

* patches from https://patch.msgid.link/20260318122157.280595-1-ptikhomirov@virtuozzo.com:
  MAINTAINERS: add a new entry for testing pidns init creation via setns
  selftests: Add tests for creating pidns init via setns
  pid_namespace: allow opening pid_for_children before init was created
  pid: check init is created first after idr alloc
  pid_namespace: avoid optimization of accesses to ->child_reaper

Link: https://patch.msgid.link/20260318122157.280595-1-ptikhomirov@virtuozzo.com


Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 1f318b96 2b46715f
Loading
Loading
Loading
Loading
+9 −2
Original line number Diff line number Diff line
@@ -18154,6 +18154,15 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/mtd/linux.git nand/next
F:	drivers/mtd/nand/
F:	include/linux/mtd/*nand*.h
NAMESPACES:
M:	Christian Brauner <christian@brauner.io>
R:	Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
L:	linux-kernel@vger.kernel.org
S:	Maintained
F:	rust/kernel/pid_namespace.rs
F:	kernel/pid_namespace.c
F:	tools/testing/selftests/pid_namespace/
NATIONAL INSTRUMENTS SERIAL DRIVER
M:	Chaitanya Vadrevu <chaitanya.vadrevu@emerson.com>
L:	linux-serial@vger.kernel.org
@@ -20759,10 +20768,8 @@ M: Christian Brauner <christian@brauner.io>
L:	linux-kernel@vger.kernel.org
S:	Maintained
T:	git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
F:	rust/kernel/pid_namespace.rs
F:	samples/pidfd/
F:	tools/testing/selftests/clone3/
F:	tools/testing/selftests/pid_namespace/
F:	tools/testing/selftests/pidfd/
K:	(?i)pidfd
K:	(?i)clone3
+2 −1
Original line number Diff line number Diff line
@@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father,

	reaper = find_alive_thread(father);
	if (reaper) {
		pid_ns->child_reaper = reaper;
		ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper);
		WRITE_ONCE(pid_ns->child_reaper, reaper);
		return reaper;
	}

+4 −1
Original line number Diff line number Diff line
@@ -2423,7 +2423,10 @@ __latent_entropy struct task_struct *copy_process(
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				struct pid_namespace *ns = ns_of_pid(pid);

				ASSERT_EXCLUSIVE_WRITER(ns->child_reaper);
				WRITE_ONCE(ns->child_reaper, p);
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}
			p->signal->shared_pending.signal = delayed.signal;
+11 −8
Original line number Diff line number Diff line
@@ -128,7 +128,7 @@ void free_pid(struct pid *pid)
			 * is the reaper wake up the reaper.  The reaper
			 * may be sleeping in zap_pid_ns_processes().
			 */
			wake_up_process(ns->child_reaper);
			wake_up_process(READ_ONCE(ns->child_reaper));
			break;
		case PIDNS_ADDING:
			/* Handle a fork failure of the first process */
@@ -215,12 +215,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
			retval = -EINVAL;
			if (tid < 1 || tid >= pid_max[ns->level - i])
				goto out_abort;
			/*
			 * Also fail if a PID != 1 is requested and
			 * no PID 1 exists.
			 */
			if (tid != 1 && !tmp->child_reaper)
				goto out_abort;
			retval = -EPERM;
			if (!checkpoint_restore_ns_capable(tmp->user_ns))
				goto out_abort;
@@ -296,9 +290,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
		i--;
		retried_preload = false;

		/*
		 * PID 1 (init) must be created first.
		 */
		if (!READ_ONCE(tmp->child_reaper) && nr != 1) {
			retval = -EINVAL;
			goto out_free;
		}

		tmp = tmp->parent;
	}

	/*
+0 −9
Original line number Diff line number Diff line
@@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task)
	}
	task_unlock(task);

	if (ns) {
		read_lock(&tasklist_lock);
		if (!ns->child_reaper) {
			put_pid_ns(ns);
			ns = NULL;
		}
		read_unlock(&tasklist_lock);
	}

	return ns ? &ns->ns : NULL;
}

Loading