Unverified Commit 2cc0b7fd authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "mount notification"

Miklos Szeredi <mszeredi@redhat.com> says:

This should be ready for adding to the v6.15 queue.  I don't see the
SELinux discussion converging, so I took the simpler version out of the
two that were suggested.

* patches from https://lore.kernel.org/r/20250129165803.72138-1-mszeredi@redhat.com:
  vfs: add notifications for mount attach and detach
  fanotify: notify on mount attach and detach
  fsnotify: add mount notification infrastructure

Link: https://lore.kernel.org/r/20250129165803.72138-1-mszeredi@redhat.com


Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents 2014c95a bf630c40
Loading
Loading
Loading
Loading
+26 −0
Original line number Diff line number Diff line
@@ -5,6 +5,8 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

extern struct list_head notify_list;

struct mnt_namespace {
	struct ns_common	ns;
	struct mount *	root;
@@ -21,6 +23,10 @@ struct mnt_namespace {
		struct rcu_head		mnt_ns_rcu;
	};
	u64 event;
#ifdef CONFIG_FSNOTIFY
	__u32			n_fsnotify_mask;
	struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
#endif
	unsigned int		nr_mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
@@ -76,6 +82,8 @@ struct mount {
#ifdef CONFIG_FSNOTIFY
	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
	__u32 mnt_fsnotify_mask;
	struct list_head to_notify;	/* need to queue notification */
	struct mnt_namespace *prev_ns;	/* previous namespace (NULL if none) */
#endif
	int mnt_id;			/* mount identifier, reused */
	u64 mnt_id_unique;		/* mount ID unique until reboot */
@@ -177,3 +185,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
	return container_of(ns, struct mnt_namespace, ns);
}

#ifdef CONFIG_FSNOTIFY
static inline void mnt_notify_add(struct mount *m)
{
	/* Optimize the case where there are no watches */
	if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) ||
	    (m->prev_ns && m->prev_ns->n_fsnotify_marks))
		list_add_tail(&m->to_notify, &notify_list);
	else
		m->prev_ns = m->mnt_ns;
}
#else
static inline void mnt_notify_add(struct mount *m)
{
}
#endif

struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
+89 −4
Original line number Diff line number Diff line
@@ -81,6 +81,9 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);

#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

@@ -163,6 +166,7 @@ static void mnt_ns_release(struct mnt_namespace *ns)
{
	/* keep alive for {list,stat}mount() */
	if (refcount_dec_and_test(&ns->passive)) {
		fsnotify_mntns_delete(ns);
		put_user_ns(ns->user_ns);
		kfree(ns);
	}
@@ -1176,6 +1180,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
		ns->mnt_first_node = &mnt->mnt_node;
	rb_link_node(&mnt->mnt_node, parent, link);
	rb_insert_color(&mnt->mnt_node, &ns->mounts);

	mnt_notify_add(mnt);
}

/*
@@ -1723,6 +1729,50 @@ int may_umount(struct vfsmount *mnt)

EXPORT_SYMBOL(may_umount);

#ifdef CONFIG_FSNOTIFY
static void mnt_notify(struct mount *p)
{
	if (!p->prev_ns && p->mnt_ns) {
		fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
	} else if (p->prev_ns && !p->mnt_ns) {
		fsnotify_mnt_detach(p->prev_ns, &p->mnt);
	} else if (p->prev_ns == p->mnt_ns) {
		fsnotify_mnt_move(p->mnt_ns, &p->mnt);
	} else {
		fsnotify_mnt_detach(p->prev_ns, &p->mnt);
		fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
	}
	p->prev_ns = p->mnt_ns;
}

static void notify_mnt_list(void)
{
	struct mount *m, *tmp;
	/*
	 * Notify about mounts that were added/reparented/detached/remain
	 * connected after unmount.
	 */
	list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
		mnt_notify(m);
		list_del_init(&m->to_notify);
	}
}

static bool need_notify_mnt_list(void)
{
	return !list_empty(&notify_list);
}
#else
static void notify_mnt_list(void)
{
}

static bool need_notify_mnt_list(void)
{
	return false;
}
#endif

static void namespace_unlock(void)
{
	struct hlist_head head;
@@ -1733,7 +1783,18 @@ static void namespace_unlock(void)
	hlist_move_list(&unmounted, &head);
	list_splice_init(&ex_mountpoints, &list);

	if (need_notify_mnt_list()) {
		/*
		 * No point blocking out concurrent readers while notifications
		 * are sent. This will also allow statmount()/listmount() to run
		 * concurrently.
		 */
		downgrade_write(&namespace_sem);
		notify_mnt_list();
		up_read(&namespace_sem);
	} else {
		up_write(&namespace_sem);
	}

	shrink_dentry_list(&list);

@@ -1846,6 +1907,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
		change_mnt_propagation(p, MS_PRIVATE);
		if (disconnect)
			hlist_add_head(&p->mnt_umount, &unmounted);

		/*
		 * At this point p->mnt_ns is NULL, notification will be queued
		 * only if
		 *
		 *  - p->prev_ns is non-NULL *and*
		 *  - p->prev_ns->n_fsnotify_marks is non-NULL
		 *
		 * This will preclude queuing the mount if this is a cleanup
		 * after a failed copy_tree() or destruction of an anonymous
		 * namespace, etc.
		 */
		mnt_notify_add(p);
	}
}

@@ -2145,16 +2219,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
	}
}

struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
{
	if (!is_mnt_ns_file(dentry))
		return NULL;

	return to_mnt_ns(get_proc_ns(dentry->d_inode));
}

static bool mnt_ns_loop(struct dentry *dentry)
{
	/* Could bind mounting the mount namespace inode cause a
	 * mount namespace loop?
	 */
	struct mnt_namespace *mnt_ns;
	if (!is_mnt_ns_file(dentry))
	struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);

	if (!mnt_ns)
		return false;

	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}

@@ -2547,6 +2629,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
			dest_mp = smp;
		unhash_mnt(source_mnt);
		attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
		mnt_notify_add(source_mnt);
		touch_mnt_namespace(source_mnt->mnt_ns);
	} else {
		if (source_mnt->mnt_ns) {
@@ -4468,6 +4551,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
	list_del_init(&new_mnt->mnt_expire);
	put_mountpoint(root_mp);
	unlock_mount_hash();
	mnt_notify_add(root_mnt);
	mnt_notify_add(new_mnt);
	chroot_fs_refs(&root, &new);
	error = 0;
out4:
+35 −3
Original line number Diff line number Diff line
@@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old,
	case FANOTIFY_EVENT_TYPE_FS_ERROR:
		return fanotify_error_event_equal(FANOTIFY_EE(old),
						  FANOTIFY_EE(new));
	case FANOTIFY_EVENT_TYPE_MNT:
		return false;
	default:
		WARN_ON_ONCE(1);
	}
@@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
	pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
		 __func__, iter_info->report_mask, event_mask, data, data_type);

	if (!fid_mode) {
	if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
		if (data_type != FSNOTIFY_EVENT_MNT)
			return 0;
	} else if (!fid_mode) {
		/* Do we have path to open a file descriptor? */
		if (!path)
			return 0;
@@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
	return &pevent->fae;
}

static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp)
{
	struct fanotify_mnt_event *pevent;

	pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp);
	if (!pevent)
		return NULL;

	pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT;
	pevent->mnt_id = mnt_id;

	return &pevent->fae;
}

static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
							int data_type,
							gfp_t gfp)
@@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event(
					      fid_mode);
	struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
	const struct path *path = fsnotify_data_path(data, data_type);
	u64 mnt_id = fsnotify_data_mnt_id(data, data_type);
	struct mem_cgroup *old_memcg;
	struct dentry *moved = NULL;
	struct inode *child = NULL;
@@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event(
						  moved, &hash, gfp);
	} else if (fid_mode) {
		event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
	} else {
	} else if (path) {
		event = fanotify_alloc_path_event(path, &hash, gfp);
	} else if (mnt_id) {
		event = fanotify_alloc_mnt_event(mnt_id, gfp);
	} else {
		WARN_ON_ONCE(1);
	}

	if (!event)
@@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
	BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
	BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);

	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
	BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24);

	mask = fanotify_group_event_mask(group, iter_info, &match_mask,
					 mask, data, data_type, dir);
@@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group,
	mempool_free(fee, &group->fanotify_data.error_events_pool);
}

static void fanotify_free_mnt_event(struct fanotify_event *event)
{
	kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event));
}

static void fanotify_free_event(struct fsnotify_group *group,
				struct fsnotify_event *fsn_event)
{
@@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group,
	case FANOTIFY_EVENT_TYPE_FS_ERROR:
		fanotify_free_error_event(group, event);
		break;
	case FANOTIFY_EVENT_TYPE_MNT:
		fanotify_free_mnt_event(event);
		break;
	default:
		WARN_ON_ONCE(1);
	}
+18 −0
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_fid_event_cachep;
extern struct kmem_cache *fanotify_path_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
extern struct kmem_cache *fanotify_mnt_event_cachep;

/* Possible states of the permission event */
enum {
@@ -244,6 +245,7 @@ enum fanotify_event_type {
	FANOTIFY_EVENT_TYPE_PATH_PERM,
	FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
	FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
	FANOTIFY_EVENT_TYPE_MNT,
	__FANOTIFY_EVENT_TYPE_NUM
};

@@ -409,12 +411,23 @@ struct fanotify_path_event {
	struct path path;
};

struct fanotify_mnt_event {
	struct fanotify_event fae;
	u64 mnt_id;
};

static inline struct fanotify_path_event *
FANOTIFY_PE(struct fanotify_event *event)
{
	return container_of(event, struct fanotify_path_event, fae);
}

static inline struct fanotify_mnt_event *
FANOTIFY_ME(struct fanotify_event *event)
{
	return container_of(event, struct fanotify_mnt_event, fae);
}

/*
 * Structure for permission fanotify events. It gets allocated and freed in
 * fanotify_handle_event() since we wait there for user response. When the
@@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask)
	return mask & FAN_FS_ERROR;
}

static inline bool fanotify_is_mnt_event(u32 mask)
{
	return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH);
}

static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
	if (event->type == FANOTIFY_EVENT_TYPE_PATH)
+75 −14
Original line number Diff line number Diff line
@@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;

#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
	(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
	(sizeof(struct fanotify_event_info_range))
#define FANOTIFY_MNT_INFO_LEN \
	(sizeof(struct fanotify_event_info_mnt))

static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode,
		fh_len = fanotify_event_object_fh_len(event);
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}
	if (fanotify_is_mnt_event(event->mask))
		event_len += FANOTIFY_MNT_INFO_LEN;

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_LEN;
@@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group,
	return -ENOENT;
}

static size_t copy_mnt_info_to_user(struct fanotify_event *event,
				    char __user *buf, int count)
{
	struct fanotify_event_info_mnt info = { };

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
	info.hdr.len = FANOTIFY_MNT_INFO_LEN;

	if (WARN_ON(count < info.hdr.len))
		return -EFAULT;

	info.mnt_id = FANOTIFY_ME(event)->mnt_id;

	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	return info.hdr.len;
}

static size_t copy_error_info_to_user(struct fanotify_event *event,
				      char __user *buf, int count)
{
@@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
		total_bytes += ret;
	}

	if (fanotify_is_mnt_event(event->mask)) {
		ret = copy_mnt_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}

@@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	/* Don't allow mixing mnt events with inode events for now */
	if (flags & FAN_REPORT_MNT) {
		if (class != FAN_CLASS_NOTIF)
			return -EINVAL;
		if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
			return -EINVAL;
	}

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

@@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char  __user *pathname)
{
	struct inode *inode = NULL;
	struct vfsmount *mnt = NULL;
	struct fsnotify_group *group;
	struct path path;
	struct fan_fsid __fsid, *fsid = NULL;
@@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
	unsigned int obj_type, fid_mode;
	void *obj;
	void *obj = NULL;
	u32 umask = 0;
	int ret;

@@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	case FAN_MARK_MNTNS:
		obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
		break;
	default:
		return -EINVAL;
	}
@@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
		return -EINVAL;
	group = fd_file(f)->private_data;

	/* Only report mount events on mnt namespace */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
		if (mask & ~FANOTIFY_MOUNT_EVENTS)
			return -EINVAL;
		if (mark_type != FAN_MARK_MNTNS)
			return -EINVAL;
	} else {
		if (mask & FANOTIFY_MOUNT_EVENTS)
			return -EINVAL;
		if (mark_type == FAN_MARK_MNTNS)
			return -EINVAL;
	}

	/*
	 * An unprivileged user is not allowed to setup mount nor filesystem
	 * marks.  This also includes setting up such marks by a group that
@@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		return -EINVAL;

@@ -1938,17 +1994,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	}

	/* inode held in place by reference to path; group by fget on fd */
	if (mark_type == FAN_MARK_INODE) {
	if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
		inode = path.dentry->d_inode;
		obj = inode;
	} else {
		mnt = path.mnt;
		if (mark_type == FAN_MARK_MOUNT)
			obj = mnt;
		else
			obj = mnt->mnt_sb;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
		obj = path.mnt;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
		obj = path.mnt->mnt_sb;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
		obj = mnt_ns_from_dentry(path.dentry);
	}

	ret = -EINVAL;
	if (!obj)
		goto path_put_and_out;

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignore mask, unless that ignore mask is supposed to survive
@@ -1956,10 +2016,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	 */
	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
		ret = mnt ? -EINVAL : -EISDIR;
		ret = !inode ? -EINVAL : -EISDIR;
		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
		if (ignore == FAN_MARK_IGNORE &&
		    (mnt || S_ISDIR(inode->i_mode)))
		    (!inode || S_ISDIR(inode->i_mode)))
			goto path_put_and_out;

		ret = 0;
@@ -1968,7 +2028,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
	}

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (mnt || !S_ISDIR(inode->i_mode)) {
	if (!inode || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
@@ -2042,7 +2102,7 @@ static int __init fanotify_user_setup(void)
				     FANOTIFY_DEFAULT_MAX_USER_MARKS);

	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);

	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
@@ -2055,6 +2115,7 @@ static int __init fanotify_user_setup(void)
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}
	fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
Loading