Commit f5c12105 authored by Oscar Salvador's avatar Oscar Salvador Committed by Andrew Morton
Browse files

mm,page_owner: fix refcount imbalance

Current code does not contemplate scenarios were an allocation and free
operation on the same pages do not handle it in the same amount at once. 
To give an example, page_alloc_exact(), where we will allocate a page of
enough order to stafisfy the size request, but we will free the remainings
right away.

In the above example, we will increment the stack_record refcount only
once, but we will decrease it the same number of times as number of unused
pages we have to free.  This will lead to a warning because of refcount
imbalance.

Fix this by recording the number of base pages in the refcount field.

Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de


Reported-by: default avatar <syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com>
Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com


Fixes: 217b2119 ("mm,page_owner: implement the tracking of the stacks count")
Signed-off-by: default avatarOscar Salvador <osalvador@suse.de>
Reviewed-by: default avatarVlastimil Babka <vbabka@suse.cz>
Tested-by: default avatarAlexandre Ghiti <alexghiti@rivosinc.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent ea4b5b33
Loading
Loading
Loading
Loading
+38 −35
Original line number Diff line number Diff line
@@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
each page. It is already implemented and activated if page owner is
enabled. Other usages are more than welcome.

It can also be used to show all the stacks and their outstanding
allocations, which gives us a quick overview of where the memory is going
without the need to screen through all the pages and match the allocation
and free operation.
It can also be used to show all the stacks and their current number of
allocated base pages, which gives us a quick overview of where the memory
is going without the need to screen through all the pages and match the
allocation and free operation.

page owner is disabled by default. So, if you'd like to use it, you need
to add "page_owner=on" to your boot cmdline. If the kernel is built
@@ -75,42 +75,45 @@ Usage

	cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
	cat stacks.txt
	 prep_new_page+0xa9/0x120
	 get_page_from_freelist+0x7e6/0x2140
	 __alloc_pages+0x18a/0x370
	 new_slab+0xc8/0x580
	 ___slab_alloc+0x1f2/0xaf0
	 __slab_alloc.isra.86+0x22/0x40
	 kmem_cache_alloc+0x31b/0x350
	 __khugepaged_enter+0x39/0x100
	 dup_mmap+0x1c7/0x5ce
	 copy_process+0x1afe/0x1c90
	 kernel_clone+0x9a/0x3c0
	 __do_sys_clone+0x66/0x90
	 do_syscall_64+0x7f/0x160
	 entry_SYSCALL_64_after_hwframe+0x6c/0x74
	stack_count: 234
	 post_alloc_hook+0x177/0x1a0
	 get_page_from_freelist+0xd01/0xd80
	 __alloc_pages+0x39e/0x7e0
	 allocate_slab+0xbc/0x3f0
	 ___slab_alloc+0x528/0x8a0
	 kmem_cache_alloc+0x224/0x3b0
	 sk_prot_alloc+0x58/0x1a0
	 sk_alloc+0x32/0x4f0
	 inet_create+0x427/0xb50
	 __sock_create+0x2e4/0x650
	 inet_ctl_sock_create+0x30/0x180
	 igmp_net_init+0xc1/0x130
	 ops_init+0x167/0x410
	 setup_net+0x304/0xa60
	 copy_net_ns+0x29b/0x4a0
	 create_new_namespaces+0x4a1/0x820
	nr_base_pages: 16
	...
	...
	echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
	cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
	cat stacks_7000.txt
	 prep_new_page+0xa9/0x120
	 get_page_from_freelist+0x7e6/0x2140
	 __alloc_pages+0x18a/0x370
	 alloc_pages_mpol+0xdf/0x1e0
	 folio_alloc+0x14/0x50
	 filemap_alloc_folio+0xb0/0x100
	 page_cache_ra_unbounded+0x97/0x180
	 filemap_fault+0x4b4/0x1200
	 __do_fault+0x2d/0x110
	 do_pte_missing+0x4b0/0xa30
	 __handle_mm_fault+0x7fa/0xb70
	 handle_mm_fault+0x125/0x300
	 do_user_addr_fault+0x3c9/0x840
	 exc_page_fault+0x68/0x150
	 asm_exc_page_fault+0x22/0x30
	stack_count: 8248
	 post_alloc_hook+0x177/0x1a0
	 get_page_from_freelist+0xd01/0xd80
	 __alloc_pages+0x39e/0x7e0
	 alloc_pages_mpol+0x22e/0x490
	 folio_alloc+0xd5/0x110
	 filemap_alloc_folio+0x78/0x230
	 page_cache_ra_order+0x287/0x6f0
	 filemap_get_pages+0x517/0x1160
	 filemap_read+0x304/0x9f0
	 xfs_file_buffered_read+0xe6/0x1d0 [xfs]
	 xfs_file_read_iter+0x1f0/0x380 [xfs]
	 __kernel_read+0x3b9/0x730
	 kernel_read_file+0x309/0x4d0
	 __do_sys_finit_module+0x381/0x730
	 do_syscall_64+0x8d/0x150
	 entry_SYSCALL_64_after_hwframe+0x62/0x6a
	nr_base_pages: 20824
	...

	cat /sys/kernel/debug/page_owner > page_owner_full.txt
+20 −14
Original line number Diff line number Diff line
@@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
	spin_unlock_irqrestore(&stack_list_lock, flags);
}

static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
				   int nr_base_pages)
{
	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

@@ -217,15 +218,20 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
			/* Add the new stack_record to our list */
			add_stack_record_to_list(stack_record, gfp_mask);
	}
	refcount_inc(&stack_record->count);
	refcount_add(nr_base_pages, &stack_record->count);
}

static void dec_stack_record_count(depot_stack_handle_t handle)
static void dec_stack_record_count(depot_stack_handle_t handle,
				   int nr_base_pages)
{
	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);

	if (stack_record)
		refcount_dec(&stack_record->count);
	if (!stack_record)
		return;

	if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
		pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
			handle);
}

static inline void __update_page_owner_handle(struct page_ext *page_ext,
@@ -306,7 +312,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
		 * the machinery is not ready yet, we cannot decrement
		 * their refcount either.
		 */
		dec_stack_record_count(alloc_handle);
		dec_stack_record_count(alloc_handle, 1 << order);
}

noinline void __set_page_owner(struct page *page, unsigned short order,
@@ -325,7 +331,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
				   current->pid, current->tgid, ts_nsec,
				   current->comm);
	page_ext_put(page_ext);
	inc_stack_record_count(handle, gfp_mask);
	inc_stack_record_count(handle, gfp_mask, 1 << order);
}

void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -872,11 +878,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
	return stack;
}

static unsigned long page_owner_stack_threshold;
static unsigned long page_owner_pages_threshold;

static int stack_print(struct seq_file *m, void *v)
{
	int i, stack_count;
	int i, nr_base_pages;
	struct stack *stack = v;
	unsigned long *entries;
	unsigned long nr_entries;
@@ -887,14 +893,14 @@ static int stack_print(struct seq_file *m, void *v)

	nr_entries = stack_record->size;
	entries = stack_record->entries;
	stack_count = refcount_read(&stack_record->count) - 1;
	nr_base_pages = refcount_read(&stack_record->count) - 1;

	if (stack_count < 1 || stack_count < page_owner_stack_threshold)
	if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
		return 0;

	for (i = 0; i < nr_entries; i++)
		seq_printf(m, " %pS\n", (void *)entries[i]);
	seq_printf(m, "stack_count: %d\n\n", stack_count);
	seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);

	return 0;
}
@@ -924,13 +930,13 @@ static const struct file_operations page_owner_stack_operations = {

static int page_owner_threshold_get(void *data, u64 *val)
{
	*val = READ_ONCE(page_owner_stack_threshold);
	*val = READ_ONCE(page_owner_pages_threshold);
	return 0;
}

static int page_owner_threshold_set(void *data, u64 val)
{
	WRITE_ONCE(page_owner_stack_threshold, val);
	WRITE_ONCE(page_owner_pages_threshold, val);
	return 0;
}