From 3c17001b21b9f168c957ced9384abe969019b609 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:24 +0200 Subject: [PATCH 01/65] pidfs: validate extensible ioctls Validate extensible ioctls stricter than we do now. Reviewed-by: Aleksa Sarai Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 +- include/linux/fs.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..0a5083b9cce5 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. */ - return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); } return false; diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..2f2edc53bf3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -4023,4 +4023,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); +static inline bool extensible_ioctl_valid(unsigned int cmd_a, + unsigned int cmd_b, size_t min_size) +{ + if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) + return false; + if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) + return false; + if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) + return false; + if (_IOC_SIZE(cmd_a) < min_size) + return false; + return true; +} + #endif /* _LINUX_FS_H */ From 4d906371d1f9fc9ce47b2c8f37444680246557bc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:25 +0200 Subject: [PATCH 02/65] nsfs: drop tautological ioctl() check This check does absolutely nothing. So drop it. Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 59aa801347a7..ce51041bb624 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -163,7 +163,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + return true; } /* Extensible ioctls require some extra handling. */ From f8527a29f4619f74bc30a9845ea87abb9a6faa1e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:26 +0200 Subject: [PATCH 03/65] nsfs: validate extensible ioctls Validate extensible ioctls stricter than we do now. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/nsfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index ce51041bb624..d016d36272d4 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -169,9 +169,11 @@ static bool nsfs_ioctl_valid(unsigned int cmd) /* Extensible ioctls require some extra handling. */ switch (_IOC_NR(cmd)) { case _IOC_NR(NS_MNT_GET_INFO): + return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_NEXT): + return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); case _IOC_NR(NS_MNT_GET_PREV): - return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); + return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); } return false; From 38f4885088fc5ad41b8b0a2a2cfc73d01e709e5c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Aug 2025 16:35:55 -0400 Subject: [PATCH 04/65] mnt_ns_tree_remove(): DTRT if mnt_ns had never been added to mnt_ns_list Actual removal is done under the lock, but for checking if need to bother the lockless RB_EMPTY_NODE() is safe - either that namespace had never been added to mnt_ns_tree, in which case the the node will stay empty, or whoever had allocated it has called mnt_ns_tree_add() and it has already run to completion. After that point RB_EMPTY_NODE() will become false and will remain false, no matter what we do with other nodes in the tree. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index ae6d1312b184..39afeb521a80 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -187,7 +187,7 @@ static void mnt_ns_release_rcu(struct rcu_head *rcu) static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!is_anon_ns(ns)) { + if (!RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) { mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); list_bidir_del_rcu(&ns->mnt_ns_list); From fa8ee8627b7411f2157aad59d3d2a68ab00adeeb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:27 +0200 Subject: [PATCH 05/65] block: use extensible_ioctl_valid() Use the new extensible_ioctl_valid() helper which is equivalent to what is done here. Reviewed-by: Jens Axboe Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/blk-integrity.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 056b8948369d..ce08ad4565e2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -58,16 +58,14 @@ new_segment: int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp) { - struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk); + struct blk_integrity *bi; struct logical_block_metadata_cap meta_cap = {}; size_t usize = _IOC_SIZE(cmd); - if (_IOC_DIR(cmd) != _IOC_DIR(FS_IOC_GETLBMD_CAP) || - _IOC_TYPE(cmd) != _IOC_TYPE(FS_IOC_GETLBMD_CAP) || - _IOC_NR(cmd) != _IOC_NR(FS_IOC_GETLBMD_CAP) || - _IOC_SIZE(cmd) < LBMD_SIZE_VER0) + if (!extensible_ioctl_valid(cmd, FS_IOC_GETLBMD_CAP, LBMD_SIZE_VER0)) return -ENOIOCTLCMD; + bi = blk_get_integrity(bdev->bd_disk); if (!bi) goto out; From e3e1812f8e25ac277f5cc9249802365300c582e3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:28 +0200 Subject: [PATCH 06/65] ns: move to_ns_common() to ns_common.h Move the helper to ns_common.h where it belongs. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 20 ++++++++++++++++++++ include/linux/nsproxy.h | 11 ----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b098..bc2e0758e1c9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -6,6 +6,15 @@ struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -13,4 +22,15 @@ struct ns_common { refcount_t count; }; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns) + #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..e6bec522b139 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace *: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. From 9296f46a9645cf753d2522093485cebe77635aa6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:29 +0200 Subject: [PATCH 07/65] nsfs: add nsfs.h header And move the stuff out from proc_ns.h where it really doesn't belong. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 26 ++++++++++++++++++++++++++ include/linux/proc_ns.h | 13 +------------ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 include/linux/nsfs.h diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 000000000000..fb84aa538091 --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#endif /* _LINUX_NSFS_H */ + diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783..5e1a4b378b79 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include +#include #include struct pid_namespace; @@ -75,16 +75,5 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ From 660def10b01b248fd97255afacb7b0e305ac833a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:30 +0200 Subject: [PATCH 08/65] ns: uniformly initialize ns_common No point in cargo-culting the same code across all the different types. Use one common initializer. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5e1a4b378b79..dbb119bda097 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -72,6 +72,22 @@ static inline int ns_alloc_inum(struct ns_common *ns) return proc_alloc_inum(&ns->inum); } +static inline int ns_common_init(struct ns_common *ns, + const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + return 0; +} + #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) From 0b40774ef06cee7c3334dcee068f06d331ff2749 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:31 +0200 Subject: [PATCH 09/65] cgroup: use ns_common_init() Don't cargo-cult the same thing over and over. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/cgroup/namespace.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..0391b6ab0bf1 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -21,20 +21,16 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) From 90d4d9f4d235673da83574f0f84be3e37f955a39 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:32 +0200 Subject: [PATCH 10/65] ipc: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- ipc/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe..d4188a88ee57 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -61,12 +61,10 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &ipcns_operations, true); if (err) goto fail_free; - ns->ns.ops = &ipcns_operations; - refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; From 96ece8eb676453d09036c9a1eed7b680267f7778 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:33 +0200 Subject: [PATCH 11/65] mnt: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..14c5cdbdd6e1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4177,18 +4177,15 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - if (!anon) { - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); - } + + ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); } - new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_inc_return(&mnt_ns_seq); - refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; INIT_LIST_HEAD(&new_ns->mnt_ns_list); From 08027f6b790be1e444e4182fb4dc53faa6539d16 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:34 +0200 Subject: [PATCH 12/65] net: use ns_common_init() Don't cargo-cult the same thing over and over. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..5fb7bd8ac45a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -397,10 +397,22 @@ static __net_init void preinit_net_sysctl(struct net *net) } /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { + const struct proc_ns_operations *ns_ops; + int ret; + +#ifdef CONFIG_NET_NS + ns_ops = &netns_operations; +#else + ns_ops = NULL; +#endif + + ret = ns_common_init(&net->ns, ns_ops, false); + if (ret) + return ret; + refcount_set(&net->passive, 1); - refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); @@ -420,6 +432,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); + return 0; } /* @@ -559,7 +572,9 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net, user_ns); + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -812,15 +827,15 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - net->ns.inum = PROC_NET_INIT_INO; - if (net != &init_net) { - int ret = ns_alloc_inum(&net->ns); - if (ret) - return ret; - } + int ret = 0; + + if (net == &init_net) + net->ns.inum = PROC_NET_INIT_INO; + else + ret = proc_alloc_inum(&to_ns_common(net)->inum); + if (ret) + return ret; + net_ns_net_debugfs(net); return 0; } @@ -1282,7 +1297,12 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif - preinit_net(&init_net, &init_user_ns); + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) From 8e199cd6e3303d8f9352030ab9e4a221f277bd51 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:35 +0200 Subject: [PATCH 13/65] pid: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..20ce4052d1c5 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -102,17 +102,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &pidns_operations, true); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; - refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); From 7b0e2c83624b02a8a11f04f0581721d95ea6e4a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:36 +0200 Subject: [PATCH 14/65] time: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- kernel/time/namespace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..0be93d8f2896 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -88,22 +88,19 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &timens_operations, true); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; From 00ed42285c46e2c84223e3ec5e1d07638948b4d1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:37 +0200 Subject: [PATCH 15/65] user: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d..98f4fe84d039 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -124,12 +124,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(&ns->ns, &userns_operations, true); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; From 09337e064cbb692d5e0e34d7e1e2f2c53166f91a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:38 +0200 Subject: [PATCH 16/65] uts: use ns_common_init() Don't cargo-cult the same thing over and over. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/utsname.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..02037010b378 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -27,16 +27,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,17 +45,15 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(&ns->ns, &utsns_operations, true); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); From 86c5aba210b145d7de011a5abaf9b785aa70a183 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:39 +0200 Subject: [PATCH 17/65] ns: remove ns_alloc_inum() It's now unused. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index dbb119bda097..e50d312f9fee 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,12 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - static inline int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) From 885fc8ac0a4dc70f5d87b80b0977292870e35c60 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:40 +0200 Subject: [PATCH 18/65] nstree: make iterator generic Move the namespace iteration infrastructure originally introduced for mount namespaces into a generic library usable by all namespace types. Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 9 ++ include/linux/nstree.h | 91 +++++++++++++++ include/linux/proc_ns.h | 3 + kernel/Makefile | 2 +- kernel/nstree.c | 233 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 include/linux/nstree.h create mode 100644 kernel/nstree.c diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bc2e0758e1c9..7224072cccc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_H #include +#include struct proc_ns_operations; @@ -20,6 +21,14 @@ struct ns_common { const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; #define to_ns_common(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 000000000000..29ad6402260c --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include +#include +#include +#include +#include +#include + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning a id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list. + */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e50d312f9fee..7f89f0829e60 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -79,6 +79,9 @@ static inline int ns_common_init(struct ns_common *ns, refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..b807516a1b43 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..bbe8bedc924c --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + + return node_to_ns(node); +} + +/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. + */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} From 7d7d164989586c0ad61934975c40ca795dc134c7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:41 +0200 Subject: [PATCH 19/65] mnt: support ns lookup Move the mount namespace to the generic ns lookup infrastructure. This allows us to drop a bunch of members from struct mnt_namespace. Signed-off-by: Christian Brauner --- fs/mount.h | 10 +--- fs/namespace.c | 139 +++++++++++-------------------------------------- fs/nsfs.c | 4 +- 3 files changed, 33 insertions(+), 120 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 97737051a8b9..76bf863c9ae2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -17,11 +17,7 @@ struct mnt_namespace { }; struct user_namespace *user_ns; struct ucounts *ucounts; - u64 seq; /* Sequence number to prevent loops */ - union { - wait_queue_head_t poll; - struct rcu_head mnt_ns_rcu; - }; + wait_queue_head_t poll; u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; #ifdef CONFIG_FSNOTIFY @@ -30,8 +26,6 @@ struct mnt_namespace { #endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; - struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ - struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -173,7 +167,7 @@ static inline bool is_local_mountpoint(const struct dentry *dentry) static inline bool is_anon_ns(struct mnt_namespace *ns) { - return ns->seq == 0; + return ns->ns.ns_id == 0; } static inline bool anon_ns_root(const struct mount *m) diff --git a/fs/namespace.c b/fs/namespace.c index 5b2c15f7170f..a68998449698 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "pnode.h" #include "internal.h" @@ -80,13 +81,10 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ -static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif -static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ -static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), @@ -119,53 +117,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { + struct ns_common *ns; + if (!node) return NULL; - return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); -} - -static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) -{ - struct mnt_namespace *ns_a = node_to_mnt_ns(a); - struct mnt_namespace *ns_b = node_to_mnt_ns(b); - u64 seq_a = ns_a->seq; - u64 seq_b = ns_b->seq; - - if (seq_a < seq_b) - return -1; - if (seq_a > seq_b) - return 1; - return 0; -} - -static inline void mnt_ns_tree_write_lock(void) -{ - write_seqlock(&mnt_ns_tree_lock); -} - -static inline void mnt_ns_tree_write_unlock(void) -{ - write_sequnlock(&mnt_ns_tree_lock); -} - -static void mnt_ns_tree_add(struct mnt_namespace *ns) -{ - struct rb_node *node, *prev; - - mnt_ns_tree_write_lock(); - node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->mnt_ns_tree_node); - if (!prev) - list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); - else - list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); - mnt_ns_tree_write_unlock(); - - WARN_ON_ONCE(node); + ns = rb_entry(node, struct ns_common, ns_tree_node); + return container_of(ns, struct mnt_namespace, ns); } static void mnt_ns_release(struct mnt_namespace *ns) @@ -181,32 +138,16 @@ DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { - mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); + mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ - if (!RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) { - mnt_ns_tree_write_lock(); - rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); - list_bidir_del_rcu(&ns->mnt_ns_list); - mnt_ns_tree_write_unlock(); - } + if (ns_tree_active(ns)) + ns_tree_remove(ns); - call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); -} - -static int mnt_ns_find(const void *key, const struct rb_node *node) -{ - const u64 mnt_ns_id = *(u64 *)key; - const struct mnt_namespace *ns = node_to_mnt_ns(node); - - if (mnt_ns_id < ns->seq) - return -1; - if (mnt_ns_id > ns->seq) - return 1; - return 0; + call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); } /* @@ -225,28 +166,21 @@ static int mnt_ns_find(const void *key, const struct rb_node *node) */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; - struct rb_node *node; - unsigned int seq; + struct mnt_namespace *mnt_ns; + struct ns_common *ns; guard(rcu)(); - do { - seq = read_seqbegin(&mnt_ns_tree_lock); - node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); - if (node) - break; - } while (read_seqretry(&mnt_ns_tree_lock, seq)); - - if (!node) + ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS); + if (!ns) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. */ - ns = node_to_mnt_ns(node); - refcount_inc(&ns->passive); - return ns; + mnt_ns = container_of(ns, struct mnt_namespace, ns); + refcount_inc(&mnt_ns->passive); + return mnt_ns; } static inline void lock_mount_hash(void) @@ -1017,7 +951,7 @@ static inline bool check_anonymous_mnt(struct mount *mnt) return false; seq = mnt->mnt_ns->seq_origin; - return !seq || (seq == current->nsproxy->mnt_ns->seq); + return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id); } /* @@ -2152,19 +2086,16 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { + struct ns_common *ns; + guard(rcu)(); for (;;) { - struct list_head *list; + ns = ns_tree_adjoined_rcu(mntns, previous); + if (IS_ERR(ns)) + return ERR_CAST(ns); - if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); - else - list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); - if (list_is_head(list, &mnt_ns_list)) - return ERR_PTR(-ENOENT); - - mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + mntns = to_mnt_ns(ns); /* * The last passive reference count is put with RCU @@ -2204,7 +2135,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!mnt_ns) return false; - return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; + return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, @@ -3080,7 +3011,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else - ns->seq_origin = src_mnt_ns->seq; + ns->seq_origin = src_mnt_ns->ns.ns_id; } mnt = __do_loopback(path, recursive); @@ -4156,15 +4087,6 @@ static void free_mnt_ns(struct mnt_namespace *ns) mnt_ns_tree_remove(ns); } -/* - * Assign a sequence number so we can detect when we attempt to bind - * mount a reference to an older mount namespace into the current - * mount namespace, preventing reference counting loops. A 64bit - * number incrementing at 10Ghz will take 12,427 years to wrap which - * is effectively never, so we can ignore the possibility. - */ -static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); - static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; @@ -4188,11 +4110,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a return ERR_PTR(ret); } if (!anon) - new_ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns_tree_gen_id(&new_ns->ns); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; - INIT_LIST_HEAD(&new_ns->mnt_ns_list); - RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -4278,7 +4198,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); - mnt_ns_tree_add(new_ns); + ns_tree_add_raw(new_ns); return new_ns; } @@ -5397,7 +5317,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; - s->sm.mnt_ns_id = ns->seq; + s->sm.mnt_ns_id = ns->ns.ns_id; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) @@ -6102,7 +6022,6 @@ static void __init init_mount_tree(void) ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - ns->seq = atomic64_inc_return(&mnt_ns_seq); ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; @@ -6117,7 +6036,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - mnt_ns_tree_add(ns); + ns_tree_add(ns); } void __init mnt_init(void) diff --git a/fs/nsfs.c b/fs/nsfs.c index d016d36272d4..80e631aeb3ce 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -139,7 +139,7 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, * the size value will be set to the size the kernel knows about. */ kinfo->size = min(usize, sizeof(*kinfo)); - kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->mnt_ns_id = mnt_ns->ns.ns_id; kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); /* Subtract the root mount of the mount namespace. */ if (kinfo->nr_mounts) @@ -221,7 +221,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, mnt_ns = container_of(ns, struct mnt_namespace, ns); idp = (__u64 __user *)arg; - id = mnt_ns->seq; + id = mnt_ns->ns.ns_id; return put_user(id, idp); } case NS_GET_PID_FROM_PIDNS: From 7c60593985331e7839ec3fea6328a3253a325e82 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:42 +0200 Subject: [PATCH 20/65] cgroup: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/cgroup/cgroup.c | 2 ++ kernel/cgroup/namespace.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..092e6bf081ed 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -6312,6 +6313,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 0391b6ab0bf1..fc12c416dfeb 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include #include #include - +#include /* cgroup namespaces */ @@ -30,16 +30,19 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); if (ret) return ERR_PTR(ret); + ns_tree_add(new_ns); return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); From 74b24a582e1ff3960f0454f57afc2bcdbc52562e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:43 +0200 Subject: [PATCH 21/65] ipc: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- ipc/msgutil.c | 1 + ipc/namespace.c | 3 +++ ipc/shm.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/ipc/msgutil.c b/ipc/msgutil.c index c7be0c792647..bbf61275df41 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "util.h" diff --git a/ipc/namespace.c b/ipc/namespace.c index d4188a88ee57..9f923c1a1eb3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "util.h" @@ -85,6 +86,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, sem_init_ns(ns); shm_init_ns(ns); + ns_tree_add(ns); return ns; @@ -201,6 +203,7 @@ void put_ipc_ns(struct ipc_namespace *ns) mq_clear_sbinfo(ns); spin_unlock(&mq_lock); + ns_tree_remove(ns); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } diff --git a/ipc/shm.c b/ipc/shm.c index a9310b6dbbc3..3db36773dd10 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -148,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns) static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); + ns_tree_add(&init_ipc_ns); return 0; } From 195f7422298d711e89643369988ed285d484dd74 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:44 +0200 Subject: [PATCH 22/65] net: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. The network namespace has a separate list with different lifetime rules which we can just leave in tact. We have a similar concept for mount namespaces as well where it is on two differenet lists for different purposes. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5fb7bd8ac45a..169ec22c4758 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -445,7 +446,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); + net->net_cookie = ns_tree_gen_id(&net->ns); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -455,6 +456,7 @@ static __net_init int setup_net(struct net *net) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -674,8 +676,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). From 488acdcec8e24377506934a95e0ba21619073e8f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:45 +0200 Subject: [PATCH 23/65] pid: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- kernel/pid_namespace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 20ce4052d1c5..228ae20299f9 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "pid_sysctl.h" @@ -122,6 +123,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + ns_tree_add(ns); return ns; out_free_inum: @@ -147,6 +149,7 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + ns_tree_remove(ns); unregister_pidns_sysctls(ns); ns_free_inum(&ns->ns); @@ -473,6 +476,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } From b36c823b9a4be5b0c8e38c3fd60cade7d41c216c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:46 +0200 Subject: [PATCH 24/65] time: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 5 +++++ init/main.c | 2 ++ kernel/time/namespace.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..7f6af7a9771e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,7 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); @@ -108,6 +109,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c..e7d2c57c65a7 100644 --- a/init/main.c +++ b/init/main.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -1072,6 +1073,7 @@ void start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); + time_ns_init(); key_init(); security_init(); dbg_late_init(); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0be93d8f2896..408f60d0a3b6 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -250,11 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); __free_page(ns->vvar_page); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct time_namespace *to_time_ns(struct ns_common *ns) @@ -487,3 +491,8 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} From 2f5243cbba6cff66cc8b43bdc14853282f5b1e67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:47 +0200 Subject: [PATCH 25/65] user: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 98f4fe84d039..ade5b6806c5c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -158,6 +159,7 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -200,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,7 +221,8 @@ static void free_user_ns(struct work_struct *work) retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); @@ -1412,6 +1416,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); From 58f976d41fd915acd2403a2b8e0eef8e9c478357 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:48 +0200 Subject: [PATCH 26/65] uts: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Signed-off-by: Christian Brauner --- kernel/utsname.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/utsname.c b/kernel/utsname.c index 02037010b378..64155417ae0c 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -58,6 +59,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -93,10 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) @@ -162,4 +166,5 @@ void __init uts_ns_init(void) offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); + ns_tree_add(&init_uts_ns); } From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: [PATCH 27/65] ns: add to__ns() to respective headers Every namespace type has a container_of(ns, , ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..9ca25346f7cb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..924e4754374f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..ba0efc8c8596 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e..a47a4ce4183e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..a09056ad090e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..5d34c4f0f945 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..fd090ceb80bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb3..89588819956b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb..5a327914b565 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9..9b327420309e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6..20b65f90549e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c..cfb0e28f2779 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c..a682830742d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758..a57b3cda8dbc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); From d2afdb73f8ad77b49eca9d110d0c54bf30d1df0f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:50 +0200 Subject: [PATCH 28/65] nsfs: add current_in_namespace() Add a helper to easily check whether a given namespace is the caller's current namespace. This is currently open-coded in a lot of places. Simply switch on the type and compare the results. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index fb84aa538091..e5a5fa83d36b 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -5,6 +5,8 @@ #define _LINUX_NSFS_H #include +#include +#include struct path; struct task_struct; @@ -22,5 +24,17 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); void nsfs_init(void); -#endif /* _LINUX_NSFS_H */ +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) + +#endif /* _LINUX_NSFS_H */ From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: [PATCH 29/65] nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/. That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/ fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets rexeced that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce..926e2680414e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139f..fa86fe3c8bd3 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ From e83f0b5d10dcf62833008327cb661c7d118bca85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:52 +0200 Subject: [PATCH 30/65] nsfs: support exhaustive file handles Pidfd file handles are exhaustive meaning they don't require a handle on another pidfd to pass to open_by_handle_at() so it can derive the filesystem to decode in. Instead it can be derived from the file handle itself. The same is possible for namespace file handles. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 ++++++ fs/internal.h | 1 + fs/nsfs.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 1 + 4 files changed, 18 insertions(+) diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..dd734d8828d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/nsfs.c b/fs/nsfs.c index 926e2680414e..22765fcab18e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -25,6 +25,14 @@ static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -598,4 +606,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94eb..3741ea1b73d8 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ From f861225b9ee9cb2da1c7b2f5f921856cb8ca86bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:53 +0200 Subject: [PATCH 31/65] nsfs: add missing id retrieval support The mount namespace has supported id retrieval for a while already. Add support for the other types as well. Signed-off-by: Christian Brauner --- fs/nsfs.c | 25 +++++++++++++------------ include/uapi/linux/nsfs.h | 6 ++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 22765fcab18e..8484bc4dd3de 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -177,6 +177,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: + case NS_GET_ID: return true; } @@ -226,18 +227,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->ns.ns_id; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -283,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index fa86fe3c8bd3..5d5bf22464c9 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,6 +40,10 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + enum init_ns_ino { IPC_NS_INIT_INO = 0xEFFFFFFFU, UTS_NS_INIT_INO = 0xEFFFFFFEU, From 87a1716c7d6551e59ca4bd9cdcdf04c67deaa337 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:54 +0200 Subject: [PATCH 32/65] tools: update nsfs.h uapi header Update the nsfs.h tools header to the uapi/nsfs.h header so we can rely on it in the selftests. Signed-off-by: Christian Brauner --- tools/include/uapi/linux/nsfs.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/nsfs.h b/tools/include/uapi/linux/nsfs.h index 34127653fd00..33c9b578b3b2 100644 --- a/tools/include/uapi/linux/nsfs.h +++ b/tools/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,4 +40,19 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + +enum init_ns_ino { + IPC_NS_INIT_INO = 0xEFFFFFFFU, + UTS_NS_INIT_INO = 0xEFFFFFFEU, + USER_NS_INIT_INO = 0xEFFFFFFDU, + PID_NS_INIT_INO = 0xEFFFFFFCU, + CGROUP_NS_INIT_INO = 0xEFFFFFFBU, + TIME_NS_INIT_INO = 0xEFFFFFFAU, + NET_NS_INIT_INO = 0xEFFFFFF9U, + MNT_NS_INIT_INO = 0xEFFFFFF8U, +}; + #endif /* __LINUX_NSFS_H */ From 14f98438f0ed7b121a847260023de64ae128887e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:55 +0200 Subject: [PATCH 33/65] selftests/namespaces: add identifier selftests Add a bunch of selftests for the identifier retrieval ioctls. Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 7 + tools/testing/selftests/namespaces/config | 7 + .../testing/selftests/namespaces/nsid_test.c | 986 ++++++++++++++++++ 4 files changed, 1001 insertions(+) create mode 100644 tools/testing/selftests/namespaces/.gitignore create mode 100644 tools/testing/selftests/namespaces/Makefile create mode 100644 tools/testing/selftests/namespaces/config create mode 100644 tools/testing/selftests/namespaces/nsid_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore new file mode 100644 index 000000000000..c1e8d634dd21 --- /dev/null +++ b/tools/testing/selftests/namespaces/.gitignore @@ -0,0 +1 @@ +nsid_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile new file mode 100644 index 000000000000..9280c703533e --- /dev/null +++ b/tools/testing/selftests/namespaces/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) + +TEST_GEN_PROGS := nsid_test + +include ../lib.mk + diff --git a/tools/testing/selftests/namespaces/config b/tools/testing/selftests/namespaces/config new file mode 100644 index 000000000000..d09836260262 --- /dev/null +++ b/tools/testing/selftests/namespaces/config @@ -0,0 +1,7 @@ +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/namespaces/nsid_test.c b/tools/testing/selftests/namespaces/nsid_test.c new file mode 100644 index 000000000000..e28accd74a57 --- /dev/null +++ b/tools/testing/selftests/namespaces/nsid_test.c @@ -0,0 +1,986 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +TEST(nsid_mntns_basic) +{ + __u64 mnt_ns_id = 0; + int fd_mntns; + int ret; + + /* Open the current mount namespace */ + fd_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_mntns, 0); + + /* Get the mount namespace ID */ + ret = ioctl(fd_mntns, NS_GET_MNTNS_ID, &mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(mnt_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 mnt_ns_id2 = 0; + ret = ioctl(fd_mntns, NS_GET_ID, &mnt_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mnt_ns_id, mnt_ns_id2); + + close(fd_mntns); +} + +TEST(nsid_mntns_separate) +{ + __u64 parent_mnt_ns_id = 0; + __u64 child_mnt_ns_id = 0; + int fd_parent_mntns, fd_child_mntns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's mount namespace ID */ + fd_parent_mntns = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(fd_parent_mntns, 0); + ret = ioctl(fd_parent_mntns, NS_GET_ID, &parent_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_mnt_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_mntns); + SKIP(return, "No permission to create mount namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's mount namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid); + fd_child_mntns = open(path, O_RDONLY); + ASSERT_GE(fd_child_mntns, 0); + + /* Get child's mount namespace ID */ + ret = ioctl(fd_child_mntns, NS_GET_ID, &child_mnt_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_mnt_ns_id, 0); + + /* Parent and child should have different mount namespace IDs */ + ASSERT_NE(parent_mnt_ns_id, child_mnt_ns_id); + + close(fd_parent_mntns); + close(fd_child_mntns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_cgroupns_basic) +{ + __u64 cgroup_ns_id = 0; + int fd_cgroupns; + int ret; + + /* Open the current cgroup namespace */ + fd_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_cgroupns, 0); + + /* Get the cgroup namespace ID */ + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(cgroup_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 cgroup_ns_id2 = 0; + ret = ioctl(fd_cgroupns, NS_GET_ID, &cgroup_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(cgroup_ns_id, cgroup_ns_id2); + + close(fd_cgroupns); +} + +TEST(nsid_cgroupns_separate) +{ + __u64 parent_cgroup_ns_id = 0; + __u64 child_cgroup_ns_id = 0; + int fd_parent_cgroupns, fd_child_cgroupns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's cgroup namespace ID */ + fd_parent_cgroupns = open("/proc/self/ns/cgroup", O_RDONLY); + ASSERT_GE(fd_parent_cgroupns, 0); + ret = ioctl(fd_parent_cgroupns, NS_GET_ID, &parent_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_cgroup_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_cgroupns); + SKIP(return, "No permission to create cgroup namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's cgroup namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/cgroup", pid); + fd_child_cgroupns = open(path, O_RDONLY); + ASSERT_GE(fd_child_cgroupns, 0); + + /* Get child's cgroup namespace ID */ + ret = ioctl(fd_child_cgroupns, NS_GET_ID, &child_cgroup_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_cgroup_ns_id, 0); + + /* Parent and child should have different cgroup namespace IDs */ + ASSERT_NE(parent_cgroup_ns_id, child_cgroup_ns_id); + + close(fd_parent_cgroupns); + close(fd_child_cgroupns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_ipcns_basic) +{ + __u64 ipc_ns_id = 0; + int fd_ipcns; + int ret; + + /* Open the current IPC namespace */ + fd_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_ipcns, 0); + + /* Get the IPC namespace ID */ + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(ipc_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 ipc_ns_id2 = 0; + ret = ioctl(fd_ipcns, NS_GET_ID, &ipc_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(ipc_ns_id, ipc_ns_id2); + + close(fd_ipcns); +} + +TEST(nsid_ipcns_separate) +{ + __u64 parent_ipc_ns_id = 0; + __u64 child_ipc_ns_id = 0; + int fd_parent_ipcns, fd_child_ipcns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's IPC namespace ID */ + fd_parent_ipcns = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(fd_parent_ipcns, 0); + ret = ioctl(fd_parent_ipcns, NS_GET_ID, &parent_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_ipc_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_ipcns); + SKIP(return, "No permission to create IPC namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's IPC namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/ipc", pid); + fd_child_ipcns = open(path, O_RDONLY); + ASSERT_GE(fd_child_ipcns, 0); + + /* Get child's IPC namespace ID */ + ret = ioctl(fd_child_ipcns, NS_GET_ID, &child_ipc_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_ipc_ns_id, 0); + + /* Parent and child should have different IPC namespace IDs */ + ASSERT_NE(parent_ipc_ns_id, child_ipc_ns_id); + + close(fd_parent_ipcns); + close(fd_child_ipcns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_utsns_basic) +{ + __u64 uts_ns_id = 0; + int fd_utsns; + int ret; + + /* Open the current UTS namespace */ + fd_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_utsns, 0); + + /* Get the UTS namespace ID */ + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(uts_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 uts_ns_id2 = 0; + ret = ioctl(fd_utsns, NS_GET_ID, &uts_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(uts_ns_id, uts_ns_id2); + + close(fd_utsns); +} + +TEST(nsid_utsns_separate) +{ + __u64 parent_uts_ns_id = 0; + __u64 child_uts_ns_id = 0; + int fd_parent_utsns, fd_child_utsns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's UTS namespace ID */ + fd_parent_utsns = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(fd_parent_utsns, 0); + ret = ioctl(fd_parent_utsns, NS_GET_ID, &parent_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_uts_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_utsns); + SKIP(return, "No permission to create UTS namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's UTS namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid); + fd_child_utsns = open(path, O_RDONLY); + ASSERT_GE(fd_child_utsns, 0); + + /* Get child's UTS namespace ID */ + ret = ioctl(fd_child_utsns, NS_GET_ID, &child_uts_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_uts_ns_id, 0); + + /* Parent and child should have different UTS namespace IDs */ + ASSERT_NE(parent_uts_ns_id, child_uts_ns_id); + + close(fd_parent_utsns); + close(fd_child_utsns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_userns_basic) +{ + __u64 user_ns_id = 0; + int fd_userns; + int ret; + + /* Open the current user namespace */ + fd_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_userns, 0); + + /* Get the user namespace ID */ + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(user_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 user_ns_id2 = 0; + ret = ioctl(fd_userns, NS_GET_ID, &user_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(user_ns_id, user_ns_id2); + + close(fd_userns); +} + +TEST(nsid_userns_separate) +{ + __u64 parent_user_ns_id = 0; + __u64 child_user_ns_id = 0; + int fd_parent_userns, fd_child_userns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's user namespace ID */ + fd_parent_userns = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(fd_parent_userns, 0); + ret = ioctl(fd_parent_userns, NS_GET_ID, &parent_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_user_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new user namespace */ + ret = unshare(CLONE_NEWUSER); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_userns); + SKIP(return, "No permission to create user namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's user namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_child_userns = open(path, O_RDONLY); + ASSERT_GE(fd_child_userns, 0); + + /* Get child's user namespace ID */ + ret = ioctl(fd_child_userns, NS_GET_ID, &child_user_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_user_ns_id, 0); + + /* Parent and child should have different user namespace IDs */ + ASSERT_NE(parent_user_ns_id, child_user_ns_id); + + close(fd_parent_userns); + close(fd_child_userns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_timens_basic) +{ + __u64 time_ns_id = 0; + int fd_timens; + int ret; + + /* Open the current time namespace */ + fd_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get the time namespace ID */ + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(time_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 time_ns_id2 = 0; + ret = ioctl(fd_timens, NS_GET_ID, &time_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(time_ns_id, time_ns_id2); + + close(fd_timens); +} + +TEST(nsid_timens_separate) +{ + __u64 parent_time_ns_id = 0; + __u64 child_time_ns_id = 0; + int fd_parent_timens, fd_child_timens; + int ret; + pid_t pid; + int pipefd[2]; + + /* Open the current time namespace */ + fd_parent_timens = open("/proc/self/ns/time", O_RDONLY); + if (fd_parent_timens < 0) { + SKIP(return, "Time namespaces not supported"); + } + + /* Get parent's time namespace ID */ + ret = ioctl(fd_parent_timens, NS_GET_ID, &parent_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_time_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new time namespace */ + ret = unshare(CLONE_NEWTIME); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES || errno == EINVAL) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_timens); + close(pipefd[0]); + SKIP(return, "Cannot create time namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's time namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/time", grandchild_pid); + fd_child_timens = open(path, O_RDONLY); + ASSERT_GE(fd_child_timens, 0); + + /* Get child's time namespace ID */ + ret = ioctl(fd_child_timens, NS_GET_ID, &child_time_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_time_ns_id, 0); + + /* Parent and child should have different time namespace IDs */ + ASSERT_NE(parent_time_ns_id, child_time_ns_id); + + close(fd_parent_timens); + close(fd_child_timens); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_pidns_basic) +{ + __u64 pid_ns_id = 0; + int fd_pidns; + int ret; + + /* Open the current PID namespace */ + fd_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_pidns, 0); + + /* Get the PID namespace ID */ + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(pid_ns_id, 0); + + /* Verify we can get the same ID again */ + __u64 pid_ns_id2 = 0; + ret = ioctl(fd_pidns, NS_GET_ID, &pid_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(pid_ns_id, pid_ns_id2); + + close(fd_pidns); +} + +TEST(nsid_pidns_separate) +{ + __u64 parent_pid_ns_id = 0; + __u64 child_pid_ns_id = 0; + int fd_parent_pidns, fd_child_pidns; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's PID namespace ID */ + fd_parent_pidns = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(fd_parent_pidns, 0); + ret = ioctl(fd_parent_pidns, NS_GET_ID, &parent_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_pid_ns_id, 0); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new PID namespace */ + ret = unshare(CLONE_NEWPID); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Fork a grandchild to actually enter the new namespace */ + pid_t grandchild = fork(); + if (grandchild == 0) { + /* Grandchild is in the new namespace */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + pause(); + _exit(0); + } else if (grandchild > 0) { + /* Child writes grandchild PID and waits */ + write(pipefd[1], "Y", 1); + write(pipefd[1], &grandchild, sizeof(grandchild)); + close(pipefd[1]); + pause(); /* Keep the parent alive to maintain the grandchild */ + _exit(0); + } else { + _exit(1); + } + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_pidns); + close(pipefd[0]); + SKIP(return, "No permission to create PID namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + pid_t grandchild_pid; + ASSERT_EQ(read(pipefd[0], &grandchild_pid, sizeof(grandchild_pid)), sizeof(grandchild_pid)); + close(pipefd[0]); + + /* Open grandchild's PID namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/pid", grandchild_pid); + fd_child_pidns = open(path, O_RDONLY); + ASSERT_GE(fd_child_pidns, 0); + + /* Get child's PID namespace ID */ + ret = ioctl(fd_child_pidns, NS_GET_ID, &child_pid_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_pid_ns_id, 0); + + /* Parent and child should have different PID namespace IDs */ + ASSERT_NE(parent_pid_ns_id, child_pid_ns_id); + + close(fd_parent_pidns); + close(fd_child_pidns); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST(nsid_netns_basic) +{ + __u64 net_ns_id = 0; + __u64 netns_cookie = 0; + int fd_netns; + int sock; + socklen_t optlen; + int ret; + + /* Open the current network namespace */ + fd_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_netns, 0); + + /* Get the network namespace ID via ioctl */ + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(net_ns_id, 0); + + /* Create a socket to get the SO_NETNS_COOKIE */ + sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + + /* Get the network namespace cookie via socket option */ + optlen = sizeof(netns_cookie); + ret = getsockopt(sock, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + ASSERT_EQ(optlen, sizeof(netns_cookie)); + + /* The namespace ID and cookie should be identical */ + ASSERT_EQ(net_ns_id, netns_cookie); + + /* Verify we can get the same ID again */ + __u64 net_ns_id2 = 0; + ret = ioctl(fd_netns, NS_GET_ID, &net_ns_id2); + ASSERT_EQ(ret, 0); + ASSERT_EQ(net_ns_id, net_ns_id2); + + close(sock); + close(fd_netns); +} + +TEST(nsid_netns_separate) +{ + __u64 parent_net_ns_id = 0; + __u64 parent_netns_cookie = 0; + __u64 child_net_ns_id = 0; + __u64 child_netns_cookie = 0; + int fd_parent_netns, fd_child_netns; + int parent_sock, child_sock; + socklen_t optlen; + int ret; + pid_t pid; + int pipefd[2]; + + /* Get parent's network namespace ID */ + fd_parent_netns = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(fd_parent_netns, 0); + ret = ioctl(fd_parent_netns, NS_GET_ID, &parent_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(parent_net_ns_id, 0); + + /* Get parent's network namespace cookie */ + parent_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(parent_sock, 0); + optlen = sizeof(parent_netns_cookie); + ret = getsockopt(parent_sock, SOL_SOCKET, SO_NETNS_COOKIE, &parent_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify parent's ID and cookie match */ + ASSERT_EQ(parent_net_ns_id, parent_netns_cookie); + + /* Create a pipe for synchronization */ + ASSERT_EQ(pipe(pipefd), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* Create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret != 0) { + /* Skip test if we don't have permission */ + if (errno == EPERM || errno == EACCES) { + write(pipefd[1], "S", 1); /* Signal skip */ + _exit(0); + } + _exit(1); + } + + /* Signal success */ + write(pipefd[1], "Y", 1); + close(pipefd[1]); + + /* Keep namespace alive */ + pause(); + _exit(0); + } + + /* Parent process */ + close(pipefd[1]); + + char buf; + ASSERT_EQ(read(pipefd[0], &buf, 1), 1); + close(pipefd[0]); + + if (buf == 'S') { + /* Child couldn't create namespace, skip test */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_parent_netns); + close(parent_sock); + SKIP(return, "No permission to create network namespace"); + } + + ASSERT_EQ(buf, 'Y'); + + /* Open child's network namespace */ + char path[256]; + snprintf(path, sizeof(path), "/proc/%d/ns/net", pid); + fd_child_netns = open(path, O_RDONLY); + ASSERT_GE(fd_child_netns, 0); + + /* Get child's network namespace ID */ + ret = ioctl(fd_child_netns, NS_GET_ID, &child_net_ns_id); + ASSERT_EQ(ret, 0); + ASSERT_NE(child_net_ns_id, 0); + + /* Create socket in child's namespace to get cookie */ + ret = setns(fd_child_netns, CLONE_NEWNET); + if (ret == 0) { + child_sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(child_sock, 0); + + optlen = sizeof(child_netns_cookie); + ret = getsockopt(child_sock, SOL_SOCKET, SO_NETNS_COOKIE, &child_netns_cookie, &optlen); + ASSERT_EQ(ret, 0); + + /* Verify child's ID and cookie match */ + ASSERT_EQ(child_net_ns_id, child_netns_cookie); + + close(child_sock); + + /* Return to parent namespace */ + setns(fd_parent_netns, CLONE_NEWNET); + } + + /* Parent and child should have different network namespace IDs */ + ASSERT_NE(parent_net_ns_id, child_net_ns_id); + if (child_netns_cookie != 0) { + ASSERT_NE(parent_netns_cookie, child_netns_cookie); + } + + close(fd_parent_netns); + close(fd_child_netns); + close(parent_sock); + + /* Clean up child process */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); +} + +TEST_HARNESS_MAIN From 28ef38a9a2c7aa4dd8dfbb1d2b12f9ea84044327 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:56 +0200 Subject: [PATCH 34/65] selftests/namespaces: add file handle selftests Add a bunch of selftests for namespace file handles. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 2 +- .../selftests/namespaces/file_handle_test.c | 1429 +++++++++++++++++ 3 files changed, 1431 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/namespaces/file_handle_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index c1e8d634dd21..7639dbf58bbf 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1 +1,2 @@ nsid_test +file_handle_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index 9280c703533e..f6c117ce2c2b 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := nsid_test +TEST_GEN_PROGS := nsid_test file_handle_test include ../lib.mk diff --git a/tools/testing/selftests/namespaces/file_handle_test.c b/tools/testing/selftests/namespaces/file_handle_test.c new file mode 100644 index 000000000000..f1bc5773f552 --- /dev/null +++ b/tools/testing/selftests/namespaces/file_handle_test.c @@ -0,0 +1,1429 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../kselftest_harness.h" + +#ifndef FD_NSFS_ROOT +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ +#endif + +TEST(nsfs_net_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT as unprivileged user */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + if (fd < 0 && errno == EPERM) { + SKIP(free(handle); close(ns_fd); + return, + "Permission denied for unprivileged user (expected)"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_uts_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open UTS namespace file descriptor */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_ipc_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open IPC namespace file descriptor */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_pid_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open PID namespace file descriptor */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_mnt_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open mount namespace file descriptor */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open user namespace file descriptor */ + ns_fd = open("/proc/self/ns/user", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_cgroup_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open cgroup namespace file descriptor */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "cgroup namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_time_handle) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + struct stat st1, st2; + + /* Drop to unprivileged uid/gid */ + ASSERT_EQ(setresgid(65534, 65534, 65534), 0); /* nogroup */ + ASSERT_EQ(setresuid(65534, 65534, 65534), 0); /* nobody */ + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open time namespace file descriptor */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); return, "time namespace not available"); + } + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Try to open using FD_NSFS_ROOT */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + if (fd < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { + SKIP(free(handle); close(ns_fd); + return, + "open_by_handle_at with FD_NSFS_ROOT not supported"); + } + ASSERT_GE(fd, 0); + + /* Verify we opened the correct namespace */ + ASSERT_EQ(fstat(ns_fd, &st1), 0); + ASSERT_EQ(fstat(fd, &st2), 0); + ASSERT_EQ(st1.st_ino, st2.st_ino); + ASSERT_EQ(st1.st_dev, st2.st_dev); + + close(fd); + close(ns_fd); + free(handle); +} + +TEST(nsfs_user_net_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current network namespace */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new network namespace */ + ret = unshare(CLONE_NEWNET); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create network namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's network namespace handle from new user+net namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new network namespace"); + } + + /* Should fail with permission denied since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_uts_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current UTS namespace */ + ns_fd = open("/proc/self/ns/uts", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new UTS namespace */ + ret = unshare(CLONE_NEWUTS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create UTS namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's UTS namespace handle from new user+uts namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new UTS namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_ipc_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current IPC namespace */ + ns_fd = open("/proc/self/ns/ipc", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new IPC namespace */ + ret = unshare(CLONE_NEWIPC); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create IPC namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's IPC namespace handle from new user+ipc namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new IPC namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_mnt_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current mount namespace */ + ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new mount namespace */ + ret = unshare(CLONE_NEWNS); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create mount namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's mount namespace handle from new user+mnt namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new mount namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_cgroup_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current cgroup namespace */ + ns_fd = open("/proc/self/ns/cgroup", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "cgroup namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new cgroup namespace */ + ret = unshare(CLONE_NEWCGROUP); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create cgroup namespace */ + close(pipefd[1]); + exit(0); + } + + /* Try to open parent's cgroup namespace handle from new user+cgroup namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new cgroup namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_pid_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current PID namespace */ + ns_fd = open("/proc/self/ns/pid", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new PID namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWPID); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create PID namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for PID namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in PID namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new PID namespace */ + /* Try to open parent's PID namespace handle from new user+pid namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new PID namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_user_time_namespace_isolation) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + pid_t pid; + int status; + int pipefd[2]; + char result; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Create pipe for communication */ + ASSERT_EQ(pipe(pipefd), 0); + + /* Get handle for current time namespace */ + ns_fd = open("/proc/self/ns/time", O_RDONLY); + if (ns_fd < 0) { + SKIP(free(handle); close(pipefd[0]); close(pipefd[1]); + return, "time namespace not available"); + } + + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); close(pipefd[0]); + close(pipefd[1]); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + close(ns_fd); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child process */ + close(pipefd[0]); + + /* First create new user namespace to drop privileges */ + ret = unshare(CLONE_NEWUSER); + if (ret < 0) { + write(pipefd[1], "U", + 1); /* Unable to create user namespace */ + close(pipefd[1]); + exit(0); + } + + /* Write uid/gid mappings to maintain some capabilities */ + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); + + if (uid_map_fd < 0 || gid_map_fd < 0 || setgroups_fd < 0) { + write(pipefd[1], "M", 1); /* Unable to set mappings */ + close(pipefd[1]); + exit(0); + } + + /* Disable setgroups to allow gid mapping */ + write(setgroups_fd, "deny", 4); + close(setgroups_fd); + + /* Map current uid/gid to root in the new namespace */ + char mapping[64]; + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); + write(uid_map_fd, mapping, strlen(mapping)); + close(uid_map_fd); + + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); + write(gid_map_fd, mapping, strlen(mapping)); + close(gid_map_fd); + + /* Now create new time namespace - requires fork to take effect */ + ret = unshare(CLONE_NEWTIME); + if (ret < 0) { + write(pipefd[1], "N", + 1); /* Unable to create time namespace */ + close(pipefd[1]); + exit(0); + } + + /* Fork again for time namespace to take effect */ + pid_t child_pid = fork(); + if (child_pid < 0) { + write(pipefd[1], "N", + 1); /* Unable to fork in time namespace */ + close(pipefd[1]); + exit(0); + } + + if (child_pid == 0) { + /* Grandchild in new time namespace */ + /* Try to open parent's time namespace handle from new user+time namespace */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); + + if (fd >= 0) { + /* Should NOT succeed - we're in a different user namespace */ + write(pipefd[1], "S", + 1); /* Unexpected success */ + close(fd); + } else if (errno == ESTALE) { + /* Expected: Stale file handle */ + write(pipefd[1], "P", 1); + } else { + /* Other error */ + write(pipefd[1], "F", 1); + } + + close(pipefd[1]); + exit(0); + } + + /* Wait for grandchild */ + waitpid(child_pid, NULL, 0); + exit(0); + } + + /* Parent process */ + close(pipefd[1]); + ASSERT_EQ(read(pipefd[0], &result, 1), 1); + + waitpid(pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); + + if (result == 'U') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new user namespace"); + } + if (result == 'M') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot set uid/gid mappings"); + } + if (result == 'N') { + SKIP(free(handle); close(pipefd[0]); + return, "Cannot create new time namespace"); + } + + /* Should fail with ESTALE since we're in a different user namespace */ + ASSERT_EQ(result, 'P'); + + close(pipefd[0]); + free(handle); +} + +TEST(nsfs_open_flags) +{ + struct file_handle *handle; + int mount_id; + int ret; + int fd; + int ns_fd; + + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); + ASSERT_NE(handle, NULL); + + /* Open a namespace file descriptor */ + ns_fd = open("/proc/self/ns/net", O_RDONLY); + ASSERT_GE(ns_fd, 0); + + /* Get handle for the namespace */ + handle->handle_bytes = MAX_HANDLE_SZ; + ret = name_to_handle_at(ns_fd, "", handle, &mount_id, AT_EMPTY_PATH); + if (ret < 0 && errno == EOPNOTSUPP) { + SKIP(free(handle); close(ns_fd); + return, "nsfs doesn't support file handles"); + } + ASSERT_EQ(ret, 0); + ASSERT_GT(handle->handle_bytes, 0); + + /* Test invalid flags that should fail */ + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_WRONLY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDWR); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TRUNC); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EPERM); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECT); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_TMPFILE); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, EINVAL); + + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_DIRECTORY); + ASSERT_LT(fd, 0); + ASSERT_EQ(errno, ENOTDIR); + + close(ns_fd); + free(handle); +} + +TEST_HARNESS_MAIN From 93f67a7ddadf6ed8997c000df9790e5d64617196 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:00 +0200 Subject: [PATCH 35/65] uts: split namespace into separate header We have dedicated headers for all namespace types. Add one for the uts namespace as well. Now it's consistent for all namespace types. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 65 +++++++++++++++++++++++++++++++++++ include/linux/utsname.h | 58 +------------------------------ 2 files changed, 66 insertions(+), 57 deletions(-) create mode 100644 include/linux/uts_namespace.h diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 000000000000..c2b619bb4e57 --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +extern struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 5d34c4f0f945..547bd4439706 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include enum uts_proc { UTS_PROC_ARCH, @@ -18,62 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - -static inline void get_uts_ns(struct uts_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -extern struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns); -extern void free_uts_ns(struct uts_namespace *ns); - -static inline void put_uts_ns(struct uts_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_uts_ns(ns); -} - -void uts_ns_init(void); -#else -static inline void get_uts_ns(struct uts_namespace *ns) -{ -} - -static inline void put_uts_ns(struct uts_namespace *ns) -{ -} - -static inline struct uts_namespace *copy_utsname(unsigned long flags, - struct user_namespace *user_ns, struct uts_namespace *old_ns) -{ - if (flags & CLONE_NEWUTS) - return ERR_PTR(-EINVAL); - - return old_ns; -} - -static inline void uts_ns_init(void) -{ -} -#endif - #ifdef CONFIG_PROC_SYSCTL extern void uts_proc_notify(enum uts_proc proc); #else From b2a0b192084acd0a86d66cbbc61e17ba1f5bd583 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:01 +0200 Subject: [PATCH 36/65] mnt: expose pointer to init_mnt_ns There's various scenarios where we need to know whether we are in the initial set of namespaces or not to e.g., shortcut permission checking. All namespaces expose that information. Let's do that too. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 27 ++++++++++++++++----------- include/linux/mnt_namespace.h | 2 ++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a68998449698..f0bddc9cf2a6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6008,27 +6008,32 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, return ret; } +struct mnt_namespace init_mnt_ns = { + .ns.inum = PROC_MNT_INIT_INO, + .ns.ops = &mntns_operations, + .user_ns = &init_user_ns, + .ns.count = REFCOUNT_INIT(1), + .passive = REFCOUNT_INIT(1), + .mounts = RB_ROOT, + .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), +}; + static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; - struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, true); - if (IS_ERR(ns)) - panic("Can't allocate initial namespace"); - ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); - ns->root = m; - ns->nr_mounts = 1; - mnt_add_to_ns(ns, m); - init_task.nsproxy->mnt_ns = ns; - get_mnt_ns(ns); + init_mnt_ns.root = m; + init_mnt_ns.nr_mounts = 1; + mnt_add_to_ns(&init_mnt_ns, m); + init_task.nsproxy->mnt_ns = &init_mnt_ns; + get_mnt_ns(&init_mnt_ns); root.mnt = mnt; root.dentry = mnt->mnt_root; @@ -6036,7 +6041,7 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); - ns_tree_add(ns); + ns_tree_add(&init_mnt_ns); } void __init mnt_init(void) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..6d1c4c218c14 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,6 +11,8 @@ struct fs_struct; struct user_namespace; struct ns_common; +extern struct mnt_namespace init_mnt_ns; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); From f74ca6da113d5d4b21c00bb4da3f3c137162b4fe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:02 +0200 Subject: [PATCH 37/65] nscommon: move to separate file It's really awkward spilling the ns common infrastructure into multiple headers. Move it to a separate file. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 19 ------------------- kernel/Makefile | 2 +- kernel/nscommon.c | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 kernel/nscommon.c diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7224072cccc5..78b17fe80b62 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -31,6 +31,9 @@ struct ns_common { }; }; +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum); + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 7f89f0829e60..9f21670b5824 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,25 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_common_init(struct ns_common *ns, - const struct proc_ns_operations *ops, - bool alloc_inum) -{ - if (alloc_inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } - refcount_set(&ns->count, 1); - ns->stashed = NULL; - ns->ops = ops; - ns->ns_id = 0; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); - return 0; -} - #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) diff --git a/kernel/Makefile b/kernel/Makefile index b807516a1b43..1f48f7cd2d7b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o nstree.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..ebf4783d0505 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + return 0; +} From 5fc6bef178f1b644f1439e520c8f83bfc83a1252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:03 +0200 Subject: [PATCH 38/65] cgroup: split namespace into separate header We have dedicated headers for all namespace types. Add one for the cgroup namespace as well. Now it's consistent for all namespace types and easy to figure out what to include. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 51 +---------------------------- include/linux/cgroup_namespace.h | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 include/linux/cgroup_namespace.h diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9ca25346f7cb..5156fed8cbc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include #include +#include struct kernel_clone_args; @@ -783,56 +784,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 000000000000..c02bb76c5e32 --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ From cc47f434271ba90c18c16e0bba360df38a8bc954 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:04 +0200 Subject: [PATCH 39/65] nsfs: add inode number for anon namespace Add an inode number anonymous namespaces. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 5d5bf22464c9..e098759ec917 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,6 +53,9 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; struct nsfs_file_handle { From 86cdbae5c61c6b8c0a2adc78dbbb0314b3254a9c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:05 +0200 Subject: [PATCH 40/65] mnt: simplify ns_common_init() handling Assign the reserved MNT_NS_ANON_INO sentinel to anonymous mount namespaces and cleanup the initial mount ns allocation. This is just a preparatory patch and the ns->inum check in ns_common_init() will be dropped in the next patch. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 ++ kernel/nscommon.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index f0bddc9cf2a6..b2fcb901ad8c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4103,6 +4103,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a return ERR_PTR(-ENOMEM); } + if (anon) + new_ns->ns.inum = MNT_NS_ANON_INO; ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); if (ret) { kfree(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index ebf4783d0505..e10fad8afe61 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) { - if (alloc_inum) { + if (alloc_inum && !ns->inum) { int ret; ret = proc_alloc_inum(&ns->inum); if (ret) From d5b27cb8c5f30c972e041b30bc38fa5875b1a469 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:06 +0200 Subject: [PATCH 41/65] net: centralize ns_common initialization Centralize ns_common initialization. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a57b3cda8dbc..9df236811454 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, false); + ret = ns_common_init(&net->ns, ns_ops, true); if (ret) return ret; @@ -590,6 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + ns_free_inum(&net->ns); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -712,6 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_free_inum(&net->ns); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); @@ -831,31 +833,12 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { - int ret = 0; - - if (net == &init_net) - net->ns.inum = PROC_NET_INIT_INO; - else - ret = proc_alloc_inum(&to_ns_common(net)->inum); - if (ret) - return ret; - net_ns_net_debugfs(net); return 0; } -static __net_exit void net_ns_net_exit(struct net *net) -{ - /* - * Initial network namespace doesn't exit so we don't need any - * special checks here. - */ - ns_free_inum(&net->ns); -} - static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { From 5612ff3ec588be09f11a9424db6d1186bcdeb3fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:07 +0200 Subject: [PATCH 42/65] nscommon: simplify initialization There's a lot of information that namespace implementers don't need to know about at all. Encapsulate this all in the initialization helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 5 +++-- include/linux/ns_common.h | 39 +++++++++++++++++++++++++++++++++++++-- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 17 ++++++++--------- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 10 files changed, 55 insertions(+), 20 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index b2fcb901ad8c..699b8c770c47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,8 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - new_ns->ns.inum = MNT_NS_ANON_INO; - ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns, &mntns_operations); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 78b17fe80b62..05c7a7dd211b 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -16,6 +16,15 @@ struct time_namespace; struct user_namespace; struct uts_namespace; +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -31,8 +40,7 @@ struct ns_common { }; }; -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum); +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -45,4 +53,31 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, struct user_namespace *: &(__ns)->ns, \ struct uts_namespace *: &(__ns)->ns) +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_common_init(__ns, __ops) \ + __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) + #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index 89588819956b..0f8bbd18a475 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(&ns->ns, &ipcns_operations, true); + err = ns_common_init(ns, &ipcns_operations); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 5a327914b565..d928c557e28b 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + ret = ns_common_init(new_ns, &cgroupns_operations); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index e10fad8afe61..c3a90bb665ad 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum) +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - if (alloc_inum && !ns->inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); - return 0; + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9b327420309e..170757c265c2 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(&ns->ns, &pidns_operations, true); + err = ns_common_init(ns, &pidns_operations); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 20b65f90549e..ce8e952104a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(&ns->ns, &timens_operations, true); + err = ns_common_init(ns, &timens_operations); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index cfb0e28f2779..db9f0463219c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(&ns->ns, &userns_operations, true); + ret = ns_common_init(ns, &userns_operations); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index a682830742d3..399888be66bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(&ns->ns, &utsns_operations, true); + err = ns_common_init(ns, &utsns_operations); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9df236811454..e50897fba8cd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, true); + ret = ns_common_init(net, ns_ops); if (ret) return ret; From 224ef741ce87aa6474b82e0eb76e0e8e1bafe544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:46 +0200 Subject: [PATCH 43/65] ns: add reference count helpers Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 45 ++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 19833ac547f9..65e258e1fdc6 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -43,16 +43,24 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns) +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) #define ns_init_inum(__ns) \ _Generic((__ns), \ @@ -83,4 +91,21 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->count); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->count); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + #endif From 2e9e6972279fec7dd352ff05abe596e55988ec41 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:47 +0200 Subject: [PATCH 44/65] mnt: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/mount.h | 2 +- fs/namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 76bf863c9ae2..79c85639a7ba 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -143,7 +143,7 @@ static inline void detach_mounts(struct dentry *dentry) static inline void get_mnt_ns(struct mnt_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern seqlock_t mount_lock; diff --git a/fs/namespace.c b/fs/namespace.c index b9f94769ec11..9109069d85cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2110,7 +2110,7 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr * the mount namespace and it might already be on its * deathbed. */ - if (!refcount_inc_not_zero(&mntns->ns.count)) + if (!ns_ref_get(mntns)) continue; return mntns; @@ -6084,7 +6084,7 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - if (!refcount_dec_and_test(&ns->ns.count)) + if (!ns_ref_put(ns)) return; namespace_lock(); emptied_ns = ns; From be5f21d3985f00827e09b798f7a07ebd6dd7f54a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:08 +0200 Subject: [PATCH 45/65] ns: add ns_common_free() And drop ns_free_inum(). Anything common that can be wasted centrally should be wasted in the new common helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 2 -- ipc/namespace.c | 4 ++-- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 5 +++++ kernel/pid_namespace.c | 4 ++-- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 4 ++-- kernel/utsname.c | 2 +- net/core/net_namespace.c | 4 ++-- 11 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 699b8c770c47..b9f94769ec11 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4082,7 +4082,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } @@ -4154,7 +4154,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 05c7a7dd211b..19833ac547f9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -41,6 +41,7 @@ struct ns_common { }; int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -80,4 +81,6 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, #define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + #endif diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 9f21670b5824..08016f6e0e6f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,8 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) #endif /* _LINUX_PROC_NS_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 0f8bbd18a475..09d261a1a2aa 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -97,7 +97,7 @@ fail_mq: fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index d928c557e28b..16ead7508371 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -40,7 +40,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c3a90bb665ad..7c1b07e2a6c9 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -18,3 +18,8 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, } return proc_alloc_inum(&ns->inum); } + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 170757c265c2..27e2dd9ee051 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -127,7 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -152,7 +152,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index ce8e952104a7..d49c73015d6e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -255,7 +255,7 @@ void free_time_ns(struct time_namespace *ns) ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index db9f0463219c..32406bcab526 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -165,7 +165,7 @@ fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -220,7 +220,7 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index 399888be66bd..95d733eb2c98 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -98,7 +98,7 @@ void free_uts_ns(struct uts_namespace *ns) ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e50897fba8cd..a6a3de56a81c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -590,7 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: - ns_free_inum(&net->ns); + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -713,7 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - ns_free_inum(&net->ns); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); From 06099e374f3ab818f0501671b21493ba2e1b94b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:48 +0200 Subject: [PATCH 46/65] cgroup: port to ns_ref_*() helpers Stop accessing ns.count directly. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index c02bb76c5e32..b7dbf4d623d2 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -29,12 +29,12 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_cgroup_ns(ns); } From d4825c99d6a738c565d5142ce37369368a4352da Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:49 +0200 Subject: [PATCH 47/65] ipc: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ipc_namespace.h | 4 ++-- ipc/namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 924e4754374f..21eff63f47da 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -140,14 +140,14 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 09d261a1a2aa..bd85d1c9d2c2 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -199,7 +199,7 @@ static void free_ipc(struct work_struct *unused) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); From 07897b38eadf5a370a6001790239f23036d5b970 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:50 +0200 Subject: [PATCH 48/65] pid: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 2 +- kernel/pid_namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index ba0efc8c8596..5b2f29d369c4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -62,7 +62,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 27e2dd9ee051..162f5fb63d75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -169,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } struct pid_namespace *copy_pid_ns(unsigned long flags, @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); From e0c173f1fa02c0b08720aa8aa0cc91c3063146ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:51 +0200 Subject: [PATCH 49/65] time: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index a47a4ce4183e..f3b9567cf1f4 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -44,7 +44,7 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -57,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); } From 96d997ea5ad1911cc393ffdb5c928b532f2f921a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:52 +0200 Subject: [PATCH 50/65] user: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 4 ++-- kernel/user_namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a09056ad090e..9a9aebbf96b9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -176,7 +176,7 @@ static inline struct user_namespace *to_user_ns(struct ns_common *ns) static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -186,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 32406bcab526..f9df45c46235 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static void free_user_ns(struct work_struct *work) kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) From 83914de1c1d39dca4a3196a03bcd64d0a861d551 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:53 +0200 Subject: [PATCH 51/65] net-sysfs: use check_net() Don't directly acess the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..3c2dc4c5e683 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) @@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); From dc41b844da530e94f5b8384deb2af602cbeb312a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:54 +0200 Subject: [PATCH 52/65] net: use check_net() Don't directly acess the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a6a3de56a81c..d5e3fd819163 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); From f12021e68a13f4f867b8d55212254f1f83f75e00 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:55 +0200 Subject: [PATCH 53/65] ipv4: use check_net() Don't directly acess the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/ipv4/inet_timewait_sock.c | 4 ++-- net/ipv4/tcp_metrics.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..56a117560c0c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -329,13 +329,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (refcount_read(&sock_net(sk)->ns.count)) + if (check_net(sock_net(sk))) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (refcount_read(&sock_net(sk)->ns.count)) { + if (check_net(sock_net(sk))) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6..b67f94c60f9f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->ns.count); + !check_net(tm_net(tm)); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); From 2438b7d63ad866d6b2bb7b8d3455a6365d9b0fbe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:56 +0200 Subject: [PATCH 54/65] uts: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h index c2b619bb4e57..23b4f0e1b338 100644 --- a/include/linux/uts_namespace.h +++ b/include/linux/uts_namespace.h @@ -25,7 +25,7 @@ static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) static inline void get_uts_ns(struct uts_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern struct uts_namespace *copy_utsname(unsigned long flags, @@ -34,7 +34,7 @@ extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_uts_ns(ns); } From 99d33ce100cbc982647c9299cadb1277cfad503e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:57 +0200 Subject: [PATCH 55/65] net: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/net/net_namespace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fd090ceb80bf..3e7c825e5810 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -270,7 +270,7 @@ static inline struct net *to_net_ns(struct ns_common *ns) /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -281,7 +281,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -289,7 +289,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -301,7 +301,7 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } void net_drop_ns(void *); From b3d8ff0679507a21c1917f6ae0b56ed9b9c625f6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:58 +0200 Subject: [PATCH 56/65] nsfs: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/nsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 8484bc4dd3de..dc0a4404b971 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -492,7 +492,7 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); - if (!refcount_inc_not_zero(&ns->count)) + if (!__ns_ref_get(ns)) return NULL; } From 024596a4e2802e457a9f92af79f246fa9631f8de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:59 +0200 Subject: [PATCH 57/65] ns: rename to __ns_ref Make it easier to grep and rename to ns_count. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 12 ++++++------ init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/nscommon.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9109069d85cd..740a6ba524d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6015,7 +6015,7 @@ struct mnt_namespace init_mnt_ns = { .ns.inum = PROC_MNT_INIT_INO, .ns.ops = &mntns_operations, .user_ns = &init_user_ns, - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 65e258e1fdc6..aea8528d799a 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -29,7 +29,7 @@ struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ union { struct { u64 ns_id; @@ -93,19 +93,19 @@ void __ns_common_free(struct ns_common *ns); static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->count); + return refcount_dec_and_test(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->count); + return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) #define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) #endif diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 043cbf80a766..547e522e6016 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,7 @@ #include struct uts_namespace init_uts_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bbf61275df41..d0f7dcf4c208 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. */ struct ipc_namespace init_ipc_ns = { - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 092e6bf081ed..a0e24adceef0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -219,7 +219,7 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7c1b07e2a6c9..7aa2be6a0c32 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - refcount_set(&ns->count, 1); + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..e222426f745d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d49c73015d6e..d70bdfb7b001 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -480,7 +480,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..17a742fb4e10 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,7 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, From d093090ea79979ca7198b03bb2c31cd518b80b59 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 12:01:38 +0200 Subject: [PATCH 58/65] selftests/namespaces: verify initial namespace inode numbers Make sure that all works correctly. Signed-off-by: Christian Brauner --- tools/testing/selftests/namespaces/.gitignore | 1 + tools/testing/selftests/namespaces/Makefile | 2 +- .../selftests/namespaces/init_ino_test.c | 61 +++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/namespaces/init_ino_test.c diff --git a/tools/testing/selftests/namespaces/.gitignore b/tools/testing/selftests/namespaces/.gitignore index 7639dbf58bbf..ccfb40837a73 100644 --- a/tools/testing/selftests/namespaces/.gitignore +++ b/tools/testing/selftests/namespaces/.gitignore @@ -1,2 +1,3 @@ nsid_test file_handle_test +init_ino_test diff --git a/tools/testing/selftests/namespaces/Makefile b/tools/testing/selftests/namespaces/Makefile index f6c117ce2c2b..5fe4b3dc07d3 100644 --- a/tools/testing/selftests/namespaces/Makefile +++ b/tools/testing/selftests/namespaces/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -TEST_GEN_PROGS := nsid_test file_handle_test +TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test include ../lib.mk diff --git a/tools/testing/selftests/namespaces/init_ino_test.c b/tools/testing/selftests/namespaces/init_ino_test.c new file mode 100644 index 000000000000..5b6993c3740b --- /dev/null +++ b/tools/testing/selftests/namespaces/init_ino_test.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +struct ns_info { + const char *name; + const char *proc_path; + unsigned int expected_ino; +}; + +static struct ns_info namespaces[] = { + { "ipc", "/proc/1/ns/ipc", IPC_NS_INIT_INO }, + { "uts", "/proc/1/ns/uts", UTS_NS_INIT_INO }, + { "user", "/proc/1/ns/user", USER_NS_INIT_INO }, + { "pid", "/proc/1/ns/pid", PID_NS_INIT_INO }, + { "cgroup", "/proc/1/ns/cgroup", CGROUP_NS_INIT_INO }, + { "time", "/proc/1/ns/time", TIME_NS_INIT_INO }, + { "net", "/proc/1/ns/net", NET_NS_INIT_INO }, + { "mnt", "/proc/1/ns/mnt", MNT_NS_INIT_INO }, +}; + +TEST(init_namespace_inodes) +{ + struct stat st; + + for (int i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + int ret = stat(namespaces[i].proc_path, &st); + + /* Some namespaces might not be available (e.g., time namespace on older kernels) */ + if (ret < 0) { + if (errno == ENOENT) { + ksft_test_result_skip("%s namespace not available\n", + namespaces[i].name); + continue; + } + ASSERT_GE(ret, 0) + TH_LOG("Failed to stat %s: %s", + namespaces[i].proc_path, strerror(errno)); + } + + ASSERT_EQ(st.st_ino, namespaces[i].expected_ino) + TH_LOG("Namespace %s has inode 0x%lx, expected 0x%x", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + + ksft_print_msg("Namespace %s: inode 0x%lx matches expected 0x%x\n", + namespaces[i].name, st.st_ino, namespaces[i].expected_ino); + } +} + +TEST_HARNESS_MAIN From 7cf730321132e726ff949c6f3c0d5c598788f7a2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Sep 2025 11:29:49 +0200 Subject: [PATCH 59/65] ns: use inode initializer for initial namespaces Just use the common helper we have. Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 740a6ba524d0..271cd6294c8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6012,7 +6012,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, } struct mnt_namespace init_mnt_ns = { - .ns.inum = PROC_MNT_INIT_INO, + .ns.inum = ns_init_inum(&init_mnt_ns), .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 547e522e6016..376b7c856d4d 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -18,7 +18,7 @@ struct uts_namespace init_uts_ns = { .domainname = UTS_DOMAINNAME, }, .user_ns = &init_user_ns, - .ns.inum = PROC_UTS_INIT_INO, + .ns.inum = ns_init_inum(&init_uts_ns), #ifdef CONFIG_UTS_NS .ns.ops = &utsns_operations, #endif diff --git a/ipc/msgutil.c b/ipc/msgutil.c index d0f7dcf4c208..dca6c8ec8f5f 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -29,7 +29,7 @@ DEFINE_SPINLOCK(mq_lock); struct ipc_namespace init_ipc_ns = { .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, - .ns.inum = PROC_IPC_INIT_INO, + .ns.inum = ns_init_inum(&init_ipc_ns), #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a0e24adceef0..245b43ff2fa4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -222,7 +222,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, }; diff --git a/kernel/pid.c b/kernel/pid.c index e222426f745d..7e8c66e0bf67 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -77,7 +77,7 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = PROC_PID_INIT_INO, + .ns.inum = ns_init_inum(&init_pid_ns), #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d70bdfb7b001..7aa4d6fedd49 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -482,7 +482,7 @@ const struct proc_ns_operations timens_for_children_operations = { struct time_namespace init_time_ns = { .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, + .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/user.c b/kernel/user.c index 17a742fb4e10..b2a53674d506 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -68,7 +68,7 @@ struct user_namespace init_user_ns = { .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, + .ns.inum = ns_init_inum(&init_user_ns), #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif From 8535bd38b4d58a3d19bf8e7dfa66e1d8180b316a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:35 +0200 Subject: [PATCH 60/65] cgroup: add missing ns_common include Add the missing include of the ns_common header. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index b7dbf4d623d2..81ccbdee425b 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -2,6 +2,8 @@ #ifndef _LINUX_CGROUP_NAMESPACE_H #define _LINUX_CGROUP_NAMESPACE_H +#include + struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; From d7610cb7454bbd8bf6d58f71b0ed57155d3c545f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:36 +0200 Subject: [PATCH 61/65] ns: simplify ns_common_init() further Simply derive the ns operations from the namespace type. Acked-by: Thomas Gleixner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 28 +++++++++++++++++++++++++--- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 9 +-------- 9 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 271cd6294c8a..d65917ec5544 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,9 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else - ret = ns_common_init(new_ns, &mntns_operations); + ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index aea8528d799a..56492cd9ff8d 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -25,6 +25,17 @@ extern struct time_namespace init_time_ns; extern struct user_namespace init_user_ns; extern struct uts_namespace init_uts_ns; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -84,10 +95,21 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) -#define ns_common_init(__ns, __ops) \ - __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/ipc/namespace.c b/ipc/namespace.c index bd85d1c9d2c2..d89dfd718d2b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(ns, &ipcns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 16ead7508371..04c98338ac08 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(new_ns, &cgroupns_operations); + ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 162f5fb63d75..a262a3f19443 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(ns, &pidns_operations); + err = ns_common_init(ns); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 7aa4d6fedd49..9f26e61be044 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(ns, &timens_operations); + err = ns_common_init(ns); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f9df45c46235..e1559e8a8a02 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(ns, &userns_operations); + ret = ns_common_init(ns); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index 95d733eb2c98..00001592ad13 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(ns, &utsns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d5e3fd819163..bdea7d5fac56 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -400,16 +400,9 @@ static __net_init void preinit_net_sysctl(struct net *net) /* init code that must occur even if setup_net() is not called. */ static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { - const struct proc_ns_operations *ns_ops; int ret; -#ifdef CONFIG_NET_NS - ns_ops = &netns_operations; -#else - ns_ops = NULL; -#endif - - ret = ns_common_init(net, ns_ops); + ret = ns_common_init(net); if (ret) return ret; From 5890f504ef543190beae2a4e244bbfa7c3e0b57c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:37 +0200 Subject: [PATCH 62/65] ns: add ns_debug() Add ns_debug() that asserts that the correct operations are used for the namespace type. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/nscommon.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7aa2be6a0c32..3cef89ddef41 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -2,6 +2,55 @@ #include #include +#include + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + default: + VFS_WARN_ON_ONCE(true); + } +} +#endif int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { @@ -12,6 +61,10 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + if (inum) { ns->inum = inum; return 0; From 10cdfcd37ade7ce736bc4a1927680f390a6b1f7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:58 +0200 Subject: [PATCH 63/65] nstree: make struct ns_tree private Don't expose it directly. There's no need to do that. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nstree.h | 13 ------------- kernel/nstree.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 29ad6402260c..8b8636690473 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,19 +9,6 @@ #include #include -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; -}; - extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; diff --git a/kernel/nstree.c b/kernel/nstree.c index bbe8bedc924c..ecc88b013eff 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -4,6 +4,20 @@ #include #include +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + struct ns_tree mnt_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), From 4055526d35746ce8b04bfa5e14e14f28bb163186 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:59 +0200 Subject: [PATCH 64/65] ns: move ns type into struct ns_common It's misplaced in struct proc_ns_operations and ns->ops might be NULL if the namespace is compiled out but we still want to know the type of the namespace for the initial namespace struct. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/nsfs.c | 18 +++++++++--------- include/linux/ns_common.h | 28 ++++++++++++++++++++++++---- include/linux/proc_ns.h | 1 - init/version-timestamp.c | 1 + ipc/msgutil.c | 1 + ipc/namespace.c | 1 - kernel/cgroup/cgroup.c | 1 + kernel/cgroup/namespace.c | 1 - kernel/nscommon.c | 5 +++-- kernel/nsproxy.c | 4 ++-- kernel/nstree.c | 8 ++++---- kernel/pid.c | 1 + kernel/pid_namespace.c | 2 -- kernel/time/namespace.c | 3 +-- kernel/user.c | 1 + kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - net/core/net_namespace.c | 1 - 19 files changed, 51 insertions(+), 34 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d65917ec5544..01334d5038a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4927,7 +4927,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5830,7 +5830,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -6016,6 +6016,7 @@ struct mnt_namespace init_mnt_ns = { .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), @@ -6333,7 +6334,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/nsfs.c b/fs/nsfs.c index dc0a4404b971..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -219,9 +219,9 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; @@ -234,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -273,7 +273,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return ret; } case NS_GET_MNTNS_ID: - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; fallthrough; case NS_GET_ID: { @@ -293,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -314,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -453,7 +453,7 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } fid->ns_id = ns->ns_id; - fid->ns_type = ns->ops->type; + fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } @@ -489,14 +489,14 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); if (!__ns_ref_get(ns)) return NULL; } - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 56492cd9ff8d..f5b68b8abb54 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ #include #include +#include struct proc_ns_operations; @@ -37,6 +38,7 @@ extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; @@ -51,7 +53,7 @@ struct ns_common { }; }; -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ @@ -106,10 +108,28 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init(__ns) \ - __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) -#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 08016f6e0e6f..e81b8e596e4f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 376b7c856d4d..d071835121c2 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,6 +8,7 @@ #include struct uts_namespace init_uts_ns = { + .ns.ns_type = ns_common_type(&init_uts_ns), .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index dca6c8ec8f5f..7a03f6d03de3 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -33,6 +33,7 @@ struct ipc_namespace init_ipc_ns = { #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index d89dfd718d2b..76abac74a5c3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -248,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245b43ff2fa4..9b75102e81cb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -224,6 +224,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.ops = &cgroupns_operations, .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 04c98338ac08..241ca05f07c8 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -137,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 3cef89ddef41..92c9df1e8774 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -7,7 +7,7 @@ #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); @@ -52,12 +52,13 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) } #endif -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; + ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8d62449237b6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c index ecc88b013eff..b24a320a11a6 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -106,7 +106,7 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree->ns_tree_lock); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -128,7 +128,7 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree->ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); @@ -197,7 +197,7 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) if (!node) return NULL; - VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); return node_to_ns(node); } @@ -225,7 +225,7 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, if (list_is_head(list, &ns_tree->ns_list)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); return list_entry_rcu(list, struct ns_common, ns_list_node); } diff --git a/kernel/pid.c b/kernel/pid.c index 7e8c66e0bf67..0c2dcddb317a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a262a3f19443..f5b222c8ac39 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -443,7 +443,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -454,7 +453,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 9f26e61be044..530cf99c2212 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -462,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -472,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -480,6 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { + .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), diff --git a/kernel/user.c b/kernel/user.c index b2a53674d506..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,6 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, + .ns.ns_type = ns_common_type(&init_user_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e1559e8a8a02..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1400,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, diff --git a/kernel/utsname.c b/kernel/utsname.c index 00001592ad13..a8cdc84648ee 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -146,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bdea7d5fac56..dfe84bd35f98 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1543,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, From af075603f27b0f6e05f1bdf64bad42fa7cfb033b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:34:00 +0200 Subject: [PATCH 65/65] ns: drop assert Otherwise we warn when e.g., no namespaces are configured but the initial namespace for is still around. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- kernel/nscommon.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 92c9df1e8774..c1fb2bad6d72 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -46,8 +46,6 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) VFS_WARN_ON_ONCE(ops != &utsns_operations); break; #endif - default: - VFS_WARN_ON_ONCE(true); } } #endif