Commit 2b3a58b1 authored by Kairui Song's avatar Kairui Song Committed by Andrew Morton
Browse files

mm/swap_cgroup: remove global swap cgroup lock

commit e9e58a4e ("memcg: avoid use cmpxchg in swap cgroup
maintainance") replaced the cmpxchg/xchg with a global irq spinlock
because some archs don't support 2-byte cmpxchg/xchg.  Clearly this
won't scale well.

And as commented in swap_cgroup.c, this lock is not needed for map
synchronization.

Emulating a 2-byte xchg with atomic cmpxchg isn't hard, so implement it
to get rid of this lock.  Introduce two helpers for doing so; they can
be easily dropped once a generic 2-byte xchg is supported.

Testing using a 64G brd and building the kernel with make -j96 in a 1.5G
memory cgroup using 4k folios showed the improvement below (6 test runs):

Before this series:
Sys time: 10782.29 (stdev 42.353886)
Real time: 171.49 (stdev 0.595541)

After this commit:
Sys time: 9617.23 (stdev 37.764062), -10.81%
Real time: 159.65 (stdev 0.587388), -6.90%

With 64k folios and 2G memcg:
Before this series:
Sys time: 8176.94 (stdev 26.414712)
Real time: 141.98 (stdev 0.797382)

After this commit:
Sys time: 7358.98 (stdev 54.927593), -10.00%
Real time: 134.07 (stdev 0.757463), -5.57%

Sequential swapout of 8G 64k zero folios with madvise (24 test runs):
Before this series:
5461409.12 us (stdev 183957.827084)

After this commit:
5420447.26 us (stdev 196419.240317)

Sequential swapin of 8G 4k zero folios (24 test runs):
Before this series:
19736958.916667 us (stdev 189027.246676)

After this commit:
19662182.629630 us (stdev 172717.640614)

Performance is better or at least not worse for all tests above.

Link: https://lkml.kernel.org/r/20241218114633.85196-4-ryncsn@gmail.com


Signed-off-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 40733e7e
Loading
Loading
Loading
Loading
+49 −28
Original line number Diff line number Diff line
@@ -7,19 +7,20 @@

static DEFINE_MUTEX(swap_cgroup_mutex);

/* Pack two cgroup id (short) of two entries in one swap_cgroup (atomic_t) */
#define ID_PER_SC (sizeof(struct swap_cgroup) / sizeof(unsigned short))
#define ID_SHIFT (BITS_PER_TYPE(unsigned short))
#define ID_MASK (BIT(ID_SHIFT) - 1)
struct swap_cgroup {
	unsigned short		id;
	atomic_t ids;
};

struct swap_cgroup_ctrl {
	struct swap_cgroup *map;
	spinlock_t	lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

#define SC_PER_PAGE	(PAGE_SIZE/sizeof(struct swap_cgroup))

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
@@ -30,19 +31,35 @@ static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
 *    SwapCache(and its swp_entry) is under lock.
 *  - When called via swap_free(), there is no user of this entry and no race.
 * Then, we don't need lock around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
					struct swap_cgroup_ctrl **ctrlp)
static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
					      pgoff_t offset)
{
	pgoff_t offset = swp_offset(ent);
	struct swap_cgroup_ctrl *ctrl;
	unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
	unsigned int old_ids = atomic_read(&map[offset / ID_PER_SC].ids);

	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	if (ctrlp)
		*ctrlp = ctrl;
	return &ctrl->map[offset];
	BUILD_BUG_ON(!is_power_of_2(ID_PER_SC));
	BUILD_BUG_ON(sizeof(struct swap_cgroup) != sizeof(atomic_t));

	return (old_ids >> shift) & ID_MASK;
}

/*
 * Exchange the id stored for @offset with @new_id and return the id that
 * was there before.
 *
 * Several unsigned short ids share one atomic_t (ID_PER_SC per word), so
 * the id cannot be written on its own. Instead the whole word is read, the
 * ID_SHIFT-wide field belonging to @offset is replaced, and the result is
 * installed with a compare-and-exchange that is retried until it succeeds.
 * The other ids packed in the same word are carried over unchanged.
 *
 * @map:    base of the packed id array
 * @offset: index of the entry, in units of ids (not of struct swap_cgroup)
 * @new_id: id to store
 */
static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
					    pgoff_t offset,
					    unsigned short new_id)
{
	unsigned short old_id;
	/* Word that holds this entry's id. */
	struct swap_cgroup *sc = &map[offset / ID_PER_SC];
	/* Bit position of this entry's id inside that word. */
	unsigned int shift = (offset % ID_PER_SC) * ID_SHIFT;
	unsigned int new_ids, old_ids = atomic_read(&sc->ids);

	/*
	 * old_ids is not re-read inside the loop: the loop depends on
	 * atomic_try_cmpxchg() storing the currently observed value into
	 * old_ids when the exchange fails, so each retry recomputes old_id
	 * and new_ids from a fresh snapshot.
	 */
	do {
		/* Id currently stored for this entry in the snapshot. */
		old_id = (old_ids >> shift) & ID_MASK;
		/* Clear this entry's field, keep the neighbouring ids. */
		new_ids = (old_ids & ~(ID_MASK << shift));
		/* Widen before shifting so the id lands in the right field. */
		new_ids |= ((unsigned int)new_id) << shift;
	} while (!atomic_try_cmpxchg(&sc->ids, &old_ids, new_ids));

	return old_id;
}

/**
@@ -58,21 +75,19 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
				  unsigned int nr_ents)
{
	struct swap_cgroup_ctrl *ctrl;
	struct swap_cgroup *sc;
	unsigned short old;
	unsigned long flags;
	pgoff_t offset = swp_offset(ent);
	pgoff_t end = offset + nr_ents;
	unsigned short old, iter;
	struct swap_cgroup *map;

	sc = lookup_swap_cgroup(ent, &ctrl);
	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	map = ctrl->map;

	spin_lock_irqsave(&ctrl->lock, flags);
	old = sc->id;
	for (; offset < end; offset++, sc++) {
		VM_BUG_ON(sc->id != old);
		sc->id = id;
	}
	spin_unlock_irqrestore(&ctrl->lock, flags);
	old = __swap_cgroup_id_lookup(map, offset);
	do {
		iter = __swap_cgroup_id_xchg(map, offset, id);
		VM_BUG_ON(iter != old);
	} while (++offset != end);

	return old;
}
@@ -85,9 +100,13 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
	struct swap_cgroup_ctrl *ctrl;

	if (mem_cgroup_disabled())
		return 0;
	return lookup_swap_cgroup(ent, NULL)->id;

	ctrl = &swap_cgroup_ctrl[swp_type(ent)];
	return __swap_cgroup_id_lookup(ctrl->map, swp_offset(ent));
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
@@ -98,14 +117,16 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
	if (mem_cgroup_disabled())
		return 0;

	map = vcalloc(max_pages, sizeof(struct swap_cgroup));
	BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
		     sizeof(struct swap_cgroup));
	map = vcalloc(DIV_ROUND_UP(max_pages, ID_PER_SC),
		      sizeof(struct swap_cgroup));
	if (!map)
		goto nomem;

	ctrl = &swap_cgroup_ctrl[type];
	mutex_lock(&swap_cgroup_mutex);
	ctrl->map = map;
	spin_lock_init(&ctrl->lock);
	mutex_unlock(&swap_cgroup_mutex);

	return 0;