Commit 9a0ddeb7 authored by Kairui Song, committed by Andrew Morton
Browse files

mm, swap: hold a reference during scan and cleanup flag usage

The flag SWP_SCANNING was used as an indicator of whether a device is
being scanned for allocation, and prevents swapoff.  Combined with
SWP_WRITEOK, they work as a set of barriers for a clean swapoff:

1. Swapoff clears SWP_WRITEOK, allocation requests will see
   ~SWP_WRITEOK and abort as it's serialized by si->lock.
2. Swapoff unuses all allocated entries.
3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing
   allocations will stop, preventing UAF.
4. Now swapoff can free everything safely.

This makes the allocation path have a hard dependency on si->lock.
Allocations always have to acquire si->lock first for setting SWP_SCANNING
and checking SWP_WRITEOK.

This commit removes this flag, and just uses the existing per-CPU refcount
instead to prevent UAF in step 3, which serves well for such usage without
dependency on si->lock, and scales very well too.  Just hold a reference
during the whole scan and allocation process.  Swapoff will kill and wait
for the counter.

And for preventing any allocation from happening after step 1 so the unuse
in step 2 can ensure all slots are free, swapoff will acquire the ci->lock
of each cluster one by one to ensure all allocations see ~SWP_WRITEOK and
abort.

This way these dependencies on si->lock are gone.  It is also worth noting
that we can't kill the refcount as the first step of swapoff, as the unuse
process has to acquire the refcount.

Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com


Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent b228386c
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -219,7 +219,6 @@ enum {
	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
					/* add others here before... */
	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
};

#define SWAP_CLUSTER_MAX 32UL
+57 −33
Original line number Diff line number Diff line
@@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
{
	unsigned int nr_pages = 1 << order;

	lockdep_assert_held(&ci->lock);

	if (!(si->flags & SWP_WRITEOK))
		return false;

@@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
{
	int n_ret = 0;

	si->flags += SWP_SCANNING;

	while (n_ret < nr) {
		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);

@@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
		slots[n_ret++] = swp_entry(si->type, offset);
	}

	si->flags -= SWP_SCANNING;

	return n_ret;
}

@@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
	return cluster_alloc_swap(si, usage, nr, slots, order);
}

/*
 * Try to take a live reference on si->users.
 *
 * Returns true when the reference was obtained (the caller is then
 * responsible for dropping it), false when percpu_ref_tryget_live()
 * refused it.
 */
static bool get_swap_device_info(struct swap_info_struct *si)
{
	bool got_ref = percpu_ref_tryget_live(&si->users);

	/*
	 * Order the si->users check before any later read of the other
	 * swap_info_struct fields, so that those fields, including
	 * si->flags (SWP_WRITEOK), are observed up to date.
	 *
	 * Pairs with the spin_unlock() after setup_swap_info() in
	 * enable_swap_info(), and with the smp_wmb() in swapoff.
	 */
	if (got_ref)
		smp_rmb();

	return got_ref;
}

int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
	int order = swap_entry_order(entry_order);
@@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		if (get_swap_device_info(si)) {
			spin_lock(&si->lock);
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
					n_goal, swp_entries, order);
			spin_unlock(&si->lock);
			put_swap_device(si);
			if (n_ret || size > 1)
				goto check_out;
			cond_resched();
		}

		spin_lock(&swap_avail_lock);
		/*
@@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
	si = swp_swap_info(entry);
	if (!si)
		goto bad_nofile;
	if (!percpu_ref_tryget_live(&si->users))
	if (!get_swap_device_info(si))
		goto out;
	/*
	 * Guarantee the si->users are checked before accessing other
	 * fields of swap_info_struct.
	 *
	 * Paired with the spin_unlock() after setup_swap_info() in
	 * enable_swap_info().
	 */
	smp_rmb();
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto put_out;
@@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
		goto fail;

	/* This is called for allocating swap entry, not cache */
	if (get_swap_device_info(si)) {
		spin_lock(&si->lock);
		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
			atomic_long_dec(&nr_swap_pages);
		spin_unlock(&si->lock);
		put_swap_device(si);
	}
fail:
	return entry;
}
@@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
	return ret;
}

/*
 * Called after clearing SWP_WRITEOK. Takes and releases the lock of every
 * cluster of @si in turn. cluster_alloc_range() checks SWP_WRITEOK with the
 * cluster lock held, so once each lock has been cycled here every allocator
 * sees the updated flags and there will be no more allocations.
 */
static void wait_for_allocation(struct swap_info_struct *si)
{
	unsigned long offset;
	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
	struct swap_cluster_info *ci;

	/* The caller must already have cleared SWP_WRITEOK. */
	BUG_ON(si->flags & SWP_WRITEOK);

	/*
	 * Step exactly one cluster per iteration. The offset is advanced
	 * only by the loop header; advancing it again in the body would
	 * skip every other cluster and leave its lock unsynchronized.
	 */
	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
		ci = lock_cluster(si, offset);
		unlock_cluster(ci);
	}
}

SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
	struct swap_info_struct *p = NULL;
@@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
	spin_unlock(&p->lock);
	spin_unlock(&swap_lock);

	wait_for_allocation(p);

	disable_swap_slots_cache_lock();

	set_current_oom_origin();
@@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
	spin_lock(&p->lock);
	drain_mmlist();

	/* wait for anyone still in scan_swap_map_slots */
	while (p->flags >= SWP_SCANNING) {
		spin_unlock(&p->lock);
		spin_unlock(&swap_lock);
		schedule_timeout_uninterruptible(1);
		spin_lock(&swap_lock);
		spin_lock(&p->lock);
	}

	swap_file = p->swap_file;
	p->swap_file = NULL;
	p->max = 0;