Commit 4b989955 authored by Baolin Wang's avatar Baolin Wang Committed by Andrew Morton
Browse files

mm: shmem: add multi-size THP sysfs interface for anonymous shmem

To support the use of mTHP with anonymous shmem, add a new sysfs interface
'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/'
directory for each mTHP to control whether shmem is enabled for that mTHP,
with a value similar to the top level 'shmem_enabled', which can be set
to: "always", "inherit (to inherit the top level setting)", "within_size",
"advise", "never".  An 'inherit' option is added to ensure compatibility
with these global settings, and the options 'force' and 'deny' are
dropped, which are rather testing artifacts from the old ages.

By default, PMD-sized hugepages have enabled="inherit" and all other
hugepage sizes have enabled="never" for
'/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'.

In addition, if top level value is 'force', then only PMD-sized hugepages
have enabled="inherit", otherwise configuration will be failed and vice
versa.  That means now we will avoid using non-PMD sized THP to override
the global huge allocation.

[baolin.wang@linux.alibaba.com: fix transhuge.rst indentation]
  Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com
[akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols]
[baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS]
  Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com
[akpm@linux-foundation.org: huge_memory.c needs mm_types.h]
Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com


Signed-off-by: default avatarBaolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent 3d95bc21
Loading
Loading
Loading
Loading
+25 −0
Original line number Diff line number Diff line
@@ -332,6 +332,31 @@ deny
force
    Force the huge option on for all - very useful for testing;

Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to
control mTHP allocation:
'/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled',
and its value for each mTHP is essentially consistent with the global
setting.  An 'inherit' option is added to ensure compatibility with these
global settings.  Conversely, the options 'force' and 'deny' are dropped,
which are rather testing artifacts from the old ages.

always
    Attempt to allocate <size> huge pages every time we need a new page;

inherit
    Inherit the top-level "shmem_enabled" value. By default, PMD-sized hugepages
    have enabled="inherit" and all other hugepage sizes have enabled="never";

never
    Do not allocate <size> huge pages;

within_size
    Only allocate <size> huge page if it will be fully within i_size.
    Also respect fadvise()/madvise() hints;

advise
    Only allocate <size> huge pages if requested with fadvise()/madvise();

Need of application restart
===========================

+10 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#include <linux/mm_types.h>

#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>

vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag);
extern struct kobj_attribute shmem_enabled_attr;
extern struct kobj_attribute thpsize_shmem_enabled_attr;

/*
 * Mask of all large folio orders supported for anonymous THP; all orders up to
@@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}

struct thpsize {
	struct kobject kobj;
	struct list_head node;
	int order;
};

#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

enum mthp_stat_item {
	MTHP_STAT_ANON_FAULT_ALLOC,
	MTHP_STAT_ANON_FAULT_FALLBACK,
+4 −8
Original line number Diff line number Diff line
@@ -20,6 +20,7 @@
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
@@ -449,14 +450,6 @@ static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

struct thpsize {
	struct kobject kobj;
	struct list_head node;
	int order;
};

#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

static ssize_t thpsize_enabled_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
@@ -517,6 +510,9 @@ static struct kobj_attribute thpsize_enabled_attr =

static struct attribute *thpsize_attrs[] = {
	&thpsize_enabled_attr.attr,
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

+96 −0
Original line number Diff line number Diff line
@@ -131,6 +131,13 @@ struct shmem_options {
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
#endif

#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
@@ -4672,6 +4679,12 @@ void __init shmem_init(void)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other multi-size THPs.
	 */
	huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
	return;

@@ -4731,6 +4744,11 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
		return -EINVAL;

	/* Do not override huge allocation policy with non-PMD sized mTHP */
	if (huge == SHMEM_HUGE_FORCE &&
	    huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
		return -EINVAL;

	shmem_huge = huge;
	if (shmem_huge > SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
@@ -4738,6 +4756,84 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
}

struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
static DEFINE_SPINLOCK(huge_shmem_orders_lock);

static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
					  struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_shmem_orders_always))
		output = "[always] inherit within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_inherit))
		output = "always [inherit] within_size advise never";
	else if (test_bit(order, &huge_shmem_orders_within_size))
		output = "always inherit [within_size] advise never";
	else if (test_bit(order, &huge_shmem_orders_madvise))
		output = "always inherit within_size [advise] never";
	else
		output = "always inherit within_size advise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
					   struct kobj_attribute *attr,
					   const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_always);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		/* Do not override huge allocation policy with non-PMD sized mTHP */
		if (shmem_huge == SHMEM_HUGE_FORCE &&
		    order != HPAGE_PMD_ORDER)
			return -EINVAL;

		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_madvise);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_inherit);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "within_size")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_madvise);
		set_bit(order, &huge_shmem_orders_within_size);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		set_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_shmem_orders_lock);
		clear_bit(order, &huge_shmem_orders_always);
		clear_bit(order, &huge_shmem_orders_inherit);
		clear_bit(order, &huge_shmem_orders_within_size);
		clear_bit(order, &huge_shmem_orders_madvise);
		spin_unlock(&huge_shmem_orders_lock);
	} else {
		ret = -EINVAL;
	}

	return ret;
}

struct kobj_attribute thpsize_shmem_enabled_attr =
	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#else /* !CONFIG_SHMEM */