Commit 3371fa2f authored by Al Viro
Browse files

struct mount: relocate MNT_WRITE_HOLD bit



... from ->mnt_flags to LSB of ->mnt_pprev_for_sb.

This is safe - we always set and clear it within the same mount_lock
scope, so we won't interfere with list operations - traversals are
always forward, so they don't even look at ->mnt_pprev_for_sb and
both insertions and removals are in mount_lock scopes of their own,
so that bit will be clear in *all* mount instances during those.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent 09a1b33c
Loading
Loading
Loading
Loading
+24 −1
Original line number Diff line number Diff line
@@ -66,7 +66,8 @@ struct mount {
	struct list_head mnt_child;	/* and going through their mnt_child */
	struct mount *mnt_next_for_sb;	/* the next two fields are hlist_node, */
	struct mount * __aligned(1) *mnt_pprev_for_sb;
					/* except that LSB of pprev will be stolen */
					/* except that LSB of pprev is stolen */
#define WRITE_HOLD 1			/* ... for use by mnt_hold_writers() */
	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
	struct list_head mnt_expire;	/* link in fs-specific expiry list */
@@ -244,4 +245,26 @@ static inline struct mount *topmost_overmount(struct mount *m)
	return m;
}

/*
 * Report whether the WRITE_HOLD bit is set in a raw ->mnt_pprev_for_sb
 * value.
 *
 * Takes the pointer value itself rather than the mount, so a caller that
 * has already fetched the field (e.g. with READ_ONCE()) can test the
 * exact value it read.
 *
 * Returns true if the bit is set, false otherwise.
 */
static inline bool __test_write_hold(struct mount * __aligned(1) *val)
{
	unsigned long bits = (unsigned long)val;

	return (bits & WRITE_HOLD) != 0;
}

static inline bool test_write_hold(const struct mount *m)
{
	return __test_write_hold(m->mnt_pprev_for_sb);
}

/*
 * Set the WRITE_HOLD bit on mount @m.
 *
 * The bit lives in ->mnt_pprev_for_sb; every other bit of the stored
 * pointer value is left as it was.  This is a plain read-modify-write
 * of the field.
 * NOTE(review): presumably the caller holds mount_lock - confirm.
 */
static inline void set_write_hold(struct mount *m)
{
	unsigned long bits = (unsigned long)m->mnt_pprev_for_sb;

	bits |= WRITE_HOLD;
	m->mnt_pprev_for_sb = (void *)bits;
}

/*
 * Clear the WRITE_HOLD bit on mount @m.
 *
 * Masks the bit out of ->mnt_pprev_for_sb and stores the result back,
 * leaving every other bit of the pointer value intact.  This is a plain
 * read-modify-write of the field.
 * NOTE(review): presumably the caller holds mount_lock - confirm.
 */
static inline void clear_write_hold(struct mount *m)
{
	unsigned long bits = (unsigned long)m->mnt_pprev_for_sb;

	bits &= ~WRITE_HOLD;
	m->mnt_pprev_for_sb = (void *)bits;
}

struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
+17 −17
Original line number Diff line number Diff line
@@ -509,20 +509,20 @@ int mnt_get_write_access(struct vfsmount *m)
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set MNT_WRITE_HOLD.
	 * WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set WRITE_HOLD.
	 */
	smp_mb();
	might_lock(&mount_lock.lock);
	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
	while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			cpu_relax();
		} else {
			/*
			 * This prevents priority inversion, if the task
			 * setting MNT_WRITE_HOLD got preempted on a remote
			 * setting WRITE_HOLD got preempted on a remote
			 * CPU, and it prevents livelock if the task setting
			 * MNT_WRITE_HOLD has a lower priority and is bound to
			 * WRITE_HOLD has a lower priority and is bound to
			 * the same CPU as the task that is spinning here.
			 */
			preempt_enable();
@@ -533,7 +533,7 @@ int mnt_get_write_access(struct vfsmount *m)
	}
	/*
	 * The barrier pairs with the barrier sb_start_ro_state_change() making
	 * sure that if we see MNT_WRITE_HOLD cleared, we will also see
	 * sure that if we see WRITE_HOLD cleared, we will also see
	 * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
	 * mnt_is_readonly() and bail in case we are racing with remount
	 * read-only.
@@ -672,15 +672,15 @@ EXPORT_SYMBOL(mnt_drop_write_file);
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 *          setting MNT_WRITE_HOLD.
 *          setting WRITE_HOLD.
 * Return: On success 0 is returned.
 *	   On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
	set_write_hold(mnt);
	/*
	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
	 * After storing WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();
@@ -696,9 +696,9 @@ static inline int mnt_hold_writers(struct mount *mnt)
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * MNT_WRITE_HOLD protects against this scenario, because
	 * WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
	 * WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
@@ -720,14 +720,14 @@ static inline int mnt_hold_writers(struct mount *mnt)
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
	if (!(mnt->mnt_flags & MNT_WRITE_HOLD))
	if (!test_write_hold(mnt))
		return;
	/*
	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
	 * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
	clear_write_hold(mnt);
}

static inline void mnt_del_instance(struct mount *m)
@@ -766,7 +766,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
{
	int err = 0;

	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
	/* Racy optimization.  Recheck the counter under WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

@@ -784,8 +784,8 @@ int sb_prepare_remount_readonly(struct super_block *sb)
	if (!err)
		sb_start_ro_state_change(sb);
	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
		if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
			m->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
		if (test_write_hold(m))
			clear_write_hold(m);
	}
	unlock_mount_hash();

+1 −2
Original line number Diff line number Diff line
@@ -33,7 +33,6 @@ enum mount_flags {
	MNT_NOSYMFOLLOW	= 0x80,

	MNT_SHRINKABLE	= 0x100,
	MNT_WRITE_HOLD	= 0x200,

	MNT_INTERNAL	= 0x4000,

@@ -52,7 +51,7 @@ enum mount_flags {
				  | MNT_READONLY | MNT_NOSYMFOLLOW,
	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,

	MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
	MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED |
			     MNT_SYNC_UMOUNT | MNT_LOCKED
};