From efe89a30f70753d861340a20365812e93d34a0de Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 8 Aug 2025 14:46:43 +0200 Subject: [PATCH 01/21] s390/sclp: Refactor sclp_cmd.c To improve readability, refactor sclp_cmd.c: * Move defines and structures to the beginning. * Reverse x-mas tree usage. * Remove spaces after casting to eliminate checkpatch warnings. * Remove unnecessary comments. * Reframe certain comments. * Convert all unsigned long long to unsigned long since unsigned long long is a leftover from the 31/32-bit era. * Use correct format specifiers. * Add braces to for loops with bodies containing more than one line. * Sort header files and remove unnecessary ones. * Use __packed and __aligned instead of __attribute((packed, aligned(8))) in structures. Acked-by: Heiko Carstens Signed-off-by: Sumanth Korikkar Signed-off-by: Alexander Gordeev --- drivers/s390/char/sclp_cmd.c | 179 ++++++++++++++++------------------- 1 file changed, 83 insertions(+), 96 deletions(-) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index 16469678548f..ee0884d99d88 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -8,31 +8,56 @@ #define KMSG_COMPONENT "sclp_cmd" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include -#include -#include -#include -#include +#include #include +#include #include +#include #include "sclp.h" -#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 -#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 +#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 +#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 +/* CPU configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 +#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 +/* Channel path configuration related functions */ +#define SCLP_CMDW_CONFIGURE_CHPATH 
0x000f0001 +#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 +#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 + +struct cpu_configure_sccb { + struct sccb_header header; +} __packed __aligned(8); + +struct chp_cfg_sccb { + struct sccb_header header; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; + +struct chp_info_sccb { + struct sccb_header header; + u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; + u8 standby[SCLP_CHP_INFO_MASK_SIZE]; + u8 configured[SCLP_CHP_INFO_MASK_SIZE]; + u8 ccm; + u8 reserved[6]; + u8 cssid; +} __packed; static void sclp_sync_callback(struct sclp_req *req, void *data) { @@ -64,13 +89,11 @@ int sclp_sync_request_timeout(sclp_cmdw_t cmd, void *sccb, int timeout) request->callback_data = &completion; init_completion(&completion); - /* Perform sclp request. */ rc = sclp_add_request(request); if (rc) goto out; wait_for_completion(&completion); - /* Check response. */ if (request->status != SCLP_REQ_DONE) { pr_warn("sync request failed (cmd=0x%08x, status=0x%02x)\n", cmd, request->status); @@ -81,22 +104,15 @@ out: return rc; } -/* - * CPU configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CPU 0x00110001 -#define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 - int _sclp_get_core_info(struct sclp_core_info *info) { - int rc; - int length = test_facility(140) ? EXT_SCCB_READ_CPU : PAGE_SIZE; struct read_cpu_info_sccb *sccb; + int rc, length; if (!SCLP_HAS_CPU_INFO) return -EOPNOTSUPP; + length = test_facility(140) ? 
EXT_SCCB_READ_CPU : PAGE_SIZE; sccb = (void *)__get_free_pages(GFP_KERNEL | GFP_DMA | __GFP_ZERO, get_order(length)); if (!sccb) return -ENOMEM; @@ -114,14 +130,10 @@ int _sclp_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - free_pages((unsigned long) sccb, get_order(length)); + free_pages((unsigned long)sccb, get_order(length)); return rc; } -struct cpu_configure_sccb { - struct sccb_header header; -} __attribute__((packed, aligned(8))); - static int do_core_configure(sclp_cmdw_t cmd) { struct cpu_configure_sccb *sccb; @@ -130,8 +142,8 @@ static int do_core_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CPU_RECONFIG) return -EOPNOTSUPP; /* - * This is not going to cross a page boundary since we force - * kmalloc to have a minimum alignment of 8 bytes on s390. + * Use kmalloc to have a minimum alignment of 8 bytes and ensure sccb + * is not going to cross a page boundary. */ sccb = kzalloc(sizeof(*sccb), GFP_KERNEL | GFP_DMA); if (!sccb) @@ -183,6 +195,14 @@ struct assign_storage_sccb { u16 rn; } __packed; +struct attach_storage_sccb { + struct sccb_header header; + u16 :16; + u16 assigned; + u32 :32; + u32 entries[]; +} __packed; + int arch_get_memory_phys_device(unsigned long start_pfn) { if (!sclp.rzm) @@ -190,9 +210,9 @@ int arch_get_memory_phys_device(unsigned long start_pfn) return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); } -static unsigned long long rn2addr(u16 rn) +static unsigned long rn2addr(u16 rn) { - return (unsigned long long) (rn - 1) * sclp.rzm; + return (unsigned long)(rn - 1) * sclp.rzm; } static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) @@ -200,7 +220,7 @@ static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) struct assign_storage_sccb *sccb; int rc; - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; @@ -219,13 +239,13 @@ static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) break; } out: - 
free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } static int sclp_assign_storage(u16 rn) { - unsigned long long start; + unsigned long start; int rc; rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); @@ -241,21 +261,12 @@ static int sclp_unassign_storage(u16 rn) return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); } -struct attach_storage_sccb { - struct sccb_header header; - u16 :16; - u16 assigned; - u32 :32; - u32 entries[]; -} __packed; - static int sclp_attach_storage(u8 id) { struct attach_storage_sccb *sccb; - int rc; - int i; + int rc, i; - sccb = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; @@ -277,7 +288,7 @@ static int sclp_attach_storage(u8 id) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } @@ -285,7 +296,7 @@ static int sclp_mem_change_state(unsigned long start, unsigned long size, int online) { struct memory_increment *incr; - unsigned long long istart; + unsigned long istart; int rc = 0; list_for_each_entry(incr, &sclp_mem_list, list) { @@ -338,7 +349,7 @@ static int sclp_mem_notifier(struct notifier_block *nb, switch (action) { case MEM_GOING_OFFLINE: /* - * We do not allow to set memory blocks offline that contain + * Do not allow to set memory blocks offline that contain * standby memory. This is done to simplify the "memory online" * case. 
*/ @@ -390,16 +401,16 @@ static struct notifier_block sclp_mem_nb = { .notifier_call = sclp_mem_notifier, }; -static void __init align_to_block_size(unsigned long long *start, - unsigned long long *size, - unsigned long long alignment) +static void __init align_to_block_size(unsigned long *start, + unsigned long *size, + unsigned long alignment) { - unsigned long long start_align, size_align; + unsigned long start_align, size_align; start_align = roundup(*start, alignment); size_align = rounddown(*start + *size, alignment) - start_align; - pr_info("Standby memory at 0x%llx (%lluM of %lluM usable)\n", + pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", *start, size_align >> 20, *size >> 20); *start = start_align; *size = size_align; @@ -407,7 +418,7 @@ static void __init align_to_block_size(unsigned long long *start, static void __init add_memory_merged(u16 rn) { - unsigned long long start, size, addr, block_size; + unsigned long start, size, addr, block_size; static u16 first_rn, num; if (rn && first_rn && (first_rn + num == rn)) { @@ -417,7 +428,7 @@ static void __init add_memory_merged(u16 rn) if (!first_rn) goto skip_add; start = rn2addr(first_rn); - size = (unsigned long long) num * sclp.rzm; + size = (unsigned long)num * sclp.rzm; if (start >= ident_map_size) goto skip_add; if (start + size > ident_map_size) @@ -426,10 +437,11 @@ static void __init add_memory_merged(u16 rn) align_to_block_size(&start, &size, block_size); if (!size) goto skip_add; - for (addr = start; addr < start + size; addr += block_size) + for (addr = start; addr < start + size; addr += block_size) { add_memory(0, addr, block_size, cpu_has_edat1() ? 
MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + } skip_add: first_rn = rn; num = 1; @@ -439,9 +451,10 @@ static void __init sclp_add_standby_memory(void) { struct memory_increment *incr; - list_for_each_entry(incr, &sclp_mem_list, list) + list_for_each_entry(incr, &sclp_mem_list, list) { if (incr->standby) add_memory_merged(incr->rn); + } add_memory_merged(0); } @@ -480,12 +493,13 @@ static int __init sclp_detect_standby_memory(void) struct read_storage_sccb *sccb; int i, id, assigned, rc; - if (oldmem_data.start) /* No standby memory in kdump mode */ + /* No standby memory in kdump mode */ + if (oldmem_data.start) return 0; - if ((sclp.facilities & 0xe00000000000ULL) != 0xe00000000000ULL) + if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) return 0; rc = -ENOMEM; - sccb = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); + sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); if (!sccb) goto out; assigned = 0; @@ -531,28 +545,13 @@ static int __init sclp_detect_standby_memory(void) goto out; sclp_add_standby_memory(); out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } __initcall(sclp_detect_standby_memory); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * Channel path configuration related functions. - */ - -#define SCLP_CMDW_CONFIGURE_CHPATH 0x000f0001 -#define SCLP_CMDW_DECONFIGURE_CHPATH 0x000e0001 -#define SCLP_CMDW_READ_CHPATH_INFORMATION 0x00030001 - -struct chp_cfg_sccb { - struct sccb_header header; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - static int do_chp_configure(sclp_cmdw_t cmd) { struct chp_cfg_sccb *sccb; @@ -560,8 +559,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) if (!SCLP_HAS_CHP_RECONFIG) return -EOPNOTSUPP; - /* Prepare sccb. 
*/ - sccb = (struct chp_cfg_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_cfg_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -581,7 +579,7 @@ static int do_chp_configure(sclp_cmdw_t cmd) break; } out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } @@ -609,16 +607,6 @@ int sclp_chp_deconfigure(struct chp_id chpid) return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8); } -struct chp_info_sccb { - struct sccb_header header; - u8 recognized[SCLP_CHP_INFO_MASK_SIZE]; - u8 standby[SCLP_CHP_INFO_MASK_SIZE]; - u8 configured[SCLP_CHP_INFO_MASK_SIZE]; - u8 ccm; - u8 reserved[6]; - u8 cssid; -} __attribute__((packed)); - /** * sclp_chp_read_info - perform read channel-path information sclp command * @info: resulting channel-path information data @@ -634,8 +622,7 @@ int sclp_chp_read_info(struct sclp_chp_info *info) if (!SCLP_HAS_CHP_INFO) return -EOPNOTSUPP; - /* Prepare sccb. */ - sccb = (struct chp_info_sccb *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + sccb = (struct chp_info_sccb *)get_zeroed_page(GFP_KERNEL | GFP_DMA); if (!sccb) return -ENOMEM; sccb->header.length = sizeof(*sccb); @@ -652,6 +639,6 @@ int sclp_chp_read_info(struct sclp_chp_info *info) memcpy(info->standby, sccb->standby, SCLP_CHP_INFO_MASK_SIZE); memcpy(info->configured, sccb->configured, SCLP_CHP_INFO_MASK_SIZE); out: - free_page((unsigned long) sccb); + free_page((unsigned long)sccb); return rc; } From f9de6cdf4cf8c932ee94f6e25cd7434a97c78bf3 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 8 Aug 2025 14:46:44 +0200 Subject: [PATCH 02/21] s390/sclp: Move memory hotplug code for better modularity To improve readability and prepare for future extensions, move the memory hotplug code to a separate file. 
In addition, add required headers in sclp_mem.c and remove unnecessary headers in sclp_cmd.c. Acked-by: Heiko Carstens Signed-off-by: Sumanth Korikkar Signed-off-by: Alexander Gordeev --- drivers/s390/char/Makefile | 1 + drivers/s390/char/sclp_cmd.c | 385 --------------------------------- drivers/s390/char/sclp_mem.c | 399 +++++++++++++++++++++++++++++++++++ 3 files changed, 400 insertions(+), 385 deletions(-) create mode 100644 drivers/s390/char/sclp_mem.c diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile index 81d6744e1861..dcbd51152ee3 100644 --- a/drivers/s390/char/Makefile +++ b/drivers/s390/char/Makefile @@ -21,6 +21,7 @@ obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \ sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \ sclp_early.o sclp_early_core.o sclp_sd.o +obj-$(CONFIG_MEMORY_HOTPLUG) += sclp_mem.o obj-$(CONFIG_TN3270) += raw3270.o con3270.o obj-$(CONFIG_TN3270_FS) += fs3270.o diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index ee0884d99d88..3480198eac02 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -9,27 +9,17 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include -#include #include #include #include -#include -#include -#include -#include #include #include #include #include -#include -#include -#include #include #include "sclp.h" -#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 -#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 /* CPU configuration related functions */ #define SCLP_CMDW_CONFIGURE_CPU 0x00110001 #define SCLP_CMDW_DECONFIGURE_CPU 0x00100001 @@ -177,381 +167,6 @@ int sclp_core_deconfigure(u8 core) return do_core_configure(SCLP_CMDW_DECONFIGURE_CPU | core << 8); } -#ifdef CONFIG_MEMORY_HOTPLUG - -static DEFINE_MUTEX(sclp_mem_mutex); -static LIST_HEAD(sclp_mem_list); -static u8 sclp_max_storage_id; -static DECLARE_BITMAP(sclp_storage_ids, 256); - -struct memory_increment { - struct list_head list; - u16 rn; - int standby; 
-}; - -struct assign_storage_sccb { - struct sccb_header header; - u16 rn; -} __packed; - -struct attach_storage_sccb { - struct sccb_header header; - u16 :16; - u16 assigned; - u32 :32; - u32 entries[]; -} __packed; - -int arch_get_memory_phys_device(unsigned long start_pfn) -{ - if (!sclp.rzm) - return 0; - return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); -} - -static unsigned long rn2addr(u16 rn) -{ - return (unsigned long)(rn - 1) * sclp.rzm; -} - -static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) -{ - struct assign_storage_sccb *sccb; - int rc; - - sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - return -ENOMEM; - sccb->header.length = PAGE_SIZE; - sccb->rn = rn; - rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0020: - case 0x0120: - break; - default: - pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n", - cmd, sccb->header.response_code, rn); - rc = -EIO; - break; - } -out: - free_page((unsigned long)sccb); - return rc; -} - -static int sclp_assign_storage(u16 rn) -{ - unsigned long start; - int rc; - - rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); - if (rc) - return rc; - start = rn2addr(rn); - storage_key_init_range(start, start + sclp.rzm); - return 0; -} - -static int sclp_unassign_storage(u16 rn) -{ - return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); -} - -static int sclp_attach_storage(u8 id) -{ - struct attach_storage_sccb *sccb; - int rc, i; - - sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - return -ENOMEM; - sccb->header.length = PAGE_SIZE; - sccb->header.function_code = 0x40; - rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, - SCLP_QUEUE_INTERVAL); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0020: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (sccb->entries[i]) - sclp_unassign_storage(sccb->entries[i] >> 16); - 
} - break; - default: - rc = -EIO; - break; - } -out: - free_page((unsigned long)sccb); - return rc; -} - -static int sclp_mem_change_state(unsigned long start, unsigned long size, - int online) -{ - struct memory_increment *incr; - unsigned long istart; - int rc = 0; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (start + size - 1 < istart) - break; - if (start > istart + sclp.rzm - 1) - continue; - if (online) - rc |= sclp_assign_storage(incr->rn); - else - sclp_unassign_storage(incr->rn); - if (rc == 0) - incr->standby = online ? 0 : 1; - } - return rc ? -EIO : 0; -} - -static bool contains_standby_increment(unsigned long start, unsigned long end) -{ - struct memory_increment *incr; - unsigned long istart; - - list_for_each_entry(incr, &sclp_mem_list, list) { - istart = rn2addr(incr->rn); - if (end - 1 < istart) - continue; - if (start > istart + sclp.rzm - 1) - continue; - if (incr->standby) - return true; - } - return false; -} - -static int sclp_mem_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - unsigned long start, size; - struct memory_notify *arg; - unsigned char id; - int rc = 0; - - arg = data; - start = arg->start_pfn << PAGE_SHIFT; - size = arg->nr_pages << PAGE_SHIFT; - mutex_lock(&sclp_mem_mutex); - for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) - sclp_attach_storage(id); - switch (action) { - case MEM_GOING_OFFLINE: - /* - * Do not allow to set memory blocks offline that contain - * standby memory. This is done to simplify the "memory online" - * case. - */ - if (contains_standby_increment(start, start + size)) - rc = -EPERM; - break; - case MEM_PREPARE_ONLINE: - /* - * Access the altmap_start_pfn and altmap_nr_pages fields - * within the struct memory_notify specifically when dealing - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - * - * When altmap is in use, take the specified memory range - * online, which includes the altmap. 
- */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - rc = sclp_mem_change_state(start, size, 1); - if (rc || !arg->altmap_nr_pages) - break; - /* - * Set CMMA state to nodat here, since the struct page memory - * at the beginning of the memory block will not go through the - * buddy allocator later. - */ - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); - break; - case MEM_FINISH_OFFLINE: - /* - * When altmap is in use, take the specified memory range - * offline, which includes the altmap. - */ - if (arg->altmap_nr_pages) { - start = PFN_PHYS(arg->altmap_start_pfn); - size += PFN_PHYS(arg->altmap_nr_pages); - } - sclp_mem_change_state(start, size, 0); - break; - default: - break; - } - mutex_unlock(&sclp_mem_mutex); - return rc ? NOTIFY_BAD : NOTIFY_OK; -} - -static struct notifier_block sclp_mem_nb = { - .notifier_call = sclp_mem_notifier, -}; - -static void __init align_to_block_size(unsigned long *start, - unsigned long *size, - unsigned long alignment) -{ - unsigned long start_align, size_align; - - start_align = roundup(*start, alignment); - size_align = rounddown(*start + *size, alignment) - start_align; - - pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", - *start, size_align >> 20, *size >> 20); - *start = start_align; - *size = size_align; -} - -static void __init add_memory_merged(u16 rn) -{ - unsigned long start, size, addr, block_size; - static u16 first_rn, num; - - if (rn && first_rn && (first_rn + num == rn)) { - num++; - return; - } - if (!first_rn) - goto skip_add; - start = rn2addr(first_rn); - size = (unsigned long)num * sclp.rzm; - if (start >= ident_map_size) - goto skip_add; - if (start + size > ident_map_size) - size = ident_map_size - start; - block_size = memory_block_size_bytes(); - align_to_block_size(&start, &size, block_size); - if (!size) - goto skip_add; - for (addr = start; addr < start + size; addr += block_size) { - add_memory(0, 
addr, block_size, - cpu_has_edat1() ? - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); - } -skip_add: - first_rn = rn; - num = 1; -} - -static void __init sclp_add_standby_memory(void) -{ - struct memory_increment *incr; - - list_for_each_entry(incr, &sclp_mem_list, list) { - if (incr->standby) - add_memory_merged(incr->rn); - } - add_memory_merged(0); -} - -static void __init insert_increment(u16 rn, int standby, int assigned) -{ - struct memory_increment *incr, *new_incr; - struct list_head *prev; - u16 last_rn; - - new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); - if (!new_incr) - return; - new_incr->rn = rn; - new_incr->standby = standby; - last_rn = 0; - prev = &sclp_mem_list; - list_for_each_entry(incr, &sclp_mem_list, list) { - if (assigned && incr->rn > rn) - break; - if (!assigned && incr->rn - last_rn > 1) - break; - last_rn = incr->rn; - prev = &incr->list; - } - if (!assigned) - new_incr->rn = last_rn + 1; - if (new_incr->rn > sclp.rnmax) { - kfree(new_incr); - return; - } - list_add(&new_incr->list, prev); -} - -static int __init sclp_detect_standby_memory(void) -{ - struct read_storage_sccb *sccb; - int i, id, assigned, rc; - - /* No standby memory in kdump mode */ - if (oldmem_data.start) - return 0; - if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) - return 0; - rc = -ENOMEM; - sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); - if (!sccb) - goto out; - assigned = 0; - for (id = 0; id <= sclp_max_storage_id; id++) { - memset(sccb, 0, PAGE_SIZE); - sccb->header.length = PAGE_SIZE; - rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); - if (rc) - goto out; - switch (sccb->header.response_code) { - case 0x0010: - set_bit(id, sclp_storage_ids); - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 0, 1); - } - break; - case 0x0310: - break; - case 0x0410: - for (i = 0; i < sccb->assigned; i++) { - if (!sccb->entries[i]) - 
continue; - assigned++; - insert_increment(sccb->entries[i] >> 16, 1, 1); - } - break; - default: - rc = -EIO; - break; - } - if (!rc) - sclp_max_storage_id = sccb->max_id; - } - if (rc || list_empty(&sclp_mem_list)) - goto out; - for (i = 1; i <= sclp.rnmax - assigned; i++) - insert_increment(0, 1, 0); - rc = register_memory_notifier(&sclp_mem_nb); - if (rc) - goto out; - sclp_add_standby_memory(); -out: - free_page((unsigned long)sccb); - return rc; -} -__initcall(sclp_detect_standby_memory); - -#endif /* CONFIG_MEMORY_HOTPLUG */ - static int do_chp_configure(sclp_cmdw_t cmd) { struct chp_cfg_sccb *sccb; diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c new file mode 100644 index 000000000000..27f49f5fd358 --- /dev/null +++ b/drivers/s390/char/sclp_mem.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory hotplug support via sclp + * + * Copyright IBM Corp. 2025 + */ + +#define KMSG_COMPONENT "sclp_mem" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sclp.h" + +#define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001 +#define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001 + +static DEFINE_MUTEX(sclp_mem_mutex); +static LIST_HEAD(sclp_mem_list); +static u8 sclp_max_storage_id; +static DECLARE_BITMAP(sclp_storage_ids, 256); + +struct memory_increment { + struct list_head list; + u16 rn; + int standby; +}; + +struct assign_storage_sccb { + struct sccb_header header; + u16 rn; +} __packed; + +struct attach_storage_sccb { + struct sccb_header header; + u16 :16; + u16 assigned; + u32 :32; + u32 entries[]; +} __packed; + +int arch_get_memory_phys_device(unsigned long start_pfn) +{ + if (!sclp.rzm) + return 0; + return PFN_PHYS(start_pfn) >> ilog2(sclp.rzm); +} + +static unsigned long rn2addr(u16 rn) +{ + return (unsigned long)(rn - 1) * sclp.rzm; +} + +static int do_assign_storage(sclp_cmdw_t cmd, u16 rn) +{ + 
struct assign_storage_sccb *sccb; + int rc; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->rn = rn; + rc = sclp_sync_request_timeout(cmd, sccb, SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + case 0x0120: + break; + default: + pr_warn("assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n", + cmd, sccb->header.response_code, rn); + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_assign_storage(u16 rn) +{ + unsigned long start; + int rc; + + rc = do_assign_storage(SCLP_CMDW_ASSIGN_STORAGE, rn); + if (rc) + return rc; + start = rn2addr(rn); + storage_key_init_range(start, start + sclp.rzm); + return 0; +} + +static int sclp_unassign_storage(u16 rn) +{ + return do_assign_storage(SCLP_CMDW_UNASSIGN_STORAGE, rn); +} + +static int sclp_attach_storage(u8 id) +{ + struct attach_storage_sccb *sccb; + int rc, i; + + sccb = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + return -ENOMEM; + sccb->header.length = PAGE_SIZE; + sccb->header.function_code = 0x40; + rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, + SCLP_QUEUE_INTERVAL); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0020: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (sccb->entries[i]) + sclp_unassign_storage(sccb->entries[i] >> 16); + } + break; + default: + rc = -EIO; + break; + } +out: + free_page((unsigned long)sccb); + return rc; +} + +static int sclp_mem_change_state(unsigned long start, unsigned long size, + int online) +{ + struct memory_increment *incr; + unsigned long istart; + int rc = 0; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (start + size - 1 < istart) + break; + if (start > istart + sclp.rzm - 1) + continue; + if (online) + rc |= sclp_assign_storage(incr->rn); + else + 
sclp_unassign_storage(incr->rn); + if (rc == 0) + incr->standby = online ? 0 : 1; + } + return rc ? -EIO : 0; +} + +static bool contains_standby_increment(unsigned long start, unsigned long end) +{ + struct memory_increment *incr; + unsigned long istart; + + list_for_each_entry(incr, &sclp_mem_list, list) { + istart = rn2addr(incr->rn); + if (end - 1 < istart) + continue; + if (start > istart + sclp.rzm - 1) + continue; + if (incr->standby) + return true; + } + return false; +} + +static int sclp_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long start, size; + struct memory_notify *arg; + unsigned char id; + int rc = 0; + + arg = data; + start = arg->start_pfn << PAGE_SHIFT; + size = arg->nr_pages << PAGE_SHIFT; + mutex_lock(&sclp_mem_mutex); + for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1) + sclp_attach_storage(id); + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Do not allow to set memory blocks offline that contain + * standby memory. This is done to simplify the "memory online" + * case. + */ + if (contains_standby_increment(start, start + size)) + rc = -EPERM; + break; + case MEM_PREPARE_ONLINE: + /* + * Access the altmap_start_pfn and altmap_nr_pages fields + * within the struct memory_notify specifically when dealing + * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. + * + * When altmap is in use, take the specified memory range + * online, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + rc = sclp_mem_change_state(start, size, 1); + if (rc || !arg->altmap_nr_pages) + break; + /* + * Set CMMA state to nodat here, since the struct page memory + * at the beginning of the memory block will not go through the + * buddy allocator later. 
+ */ + __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages); + break; + case MEM_FINISH_OFFLINE: + /* + * When altmap is in use, take the specified memory range + * offline, which includes the altmap. + */ + if (arg->altmap_nr_pages) { + start = PFN_PHYS(arg->altmap_start_pfn); + size += PFN_PHYS(arg->altmap_nr_pages); + } + sclp_mem_change_state(start, size, 0); + break; + default: + break; + } + mutex_unlock(&sclp_mem_mutex); + return rc ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block sclp_mem_nb = { + .notifier_call = sclp_mem_notifier, +}; + +static void __init align_to_block_size(unsigned long *start, + unsigned long *size, + unsigned long alignment) +{ + unsigned long start_align, size_align; + + start_align = roundup(*start, alignment); + size_align = rounddown(*start + *size, alignment) - start_align; + + pr_info("Standby memory at 0x%lx (%luM of %luM usable)\n", + *start, size_align >> 20, *size >> 20); + *start = start_align; + *size = size_align; +} + +static void __init add_memory_merged(u16 rn) +{ + unsigned long start, size, addr, block_size; + static u16 first_rn, num; + + if (rn && first_rn && (first_rn + num == rn)) { + num++; + return; + } + if (!first_rn) + goto skip_add; + start = rn2addr(first_rn); + size = (unsigned long)num * sclp.rzm; + if (start >= ident_map_size) + goto skip_add; + if (start + size > ident_map_size) + size = ident_map_size - start; + block_size = memory_block_size_bytes(); + align_to_block_size(&start, &size, block_size); + if (!size) + goto skip_add; + for (addr = start; addr < start + size; addr += block_size) { + add_memory(0, addr, block_size, + cpu_has_edat1() ? 
+ MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE); + } +skip_add: + first_rn = rn; + num = 1; +} + +static void __init sclp_add_standby_memory(void) +{ + struct memory_increment *incr; + + list_for_each_entry(incr, &sclp_mem_list, list) { + if (incr->standby) + add_memory_merged(incr->rn); + } + add_memory_merged(0); +} + +static void __init insert_increment(u16 rn, int standby, int assigned) +{ + struct memory_increment *incr, *new_incr; + struct list_head *prev; + u16 last_rn; + + new_incr = kzalloc(sizeof(*new_incr), GFP_KERNEL); + if (!new_incr) + return; + new_incr->rn = rn; + new_incr->standby = standby; + last_rn = 0; + prev = &sclp_mem_list; + list_for_each_entry(incr, &sclp_mem_list, list) { + if (assigned && incr->rn > rn) + break; + if (!assigned && incr->rn - last_rn > 1) + break; + last_rn = incr->rn; + prev = &incr->list; + } + if (!assigned) + new_incr->rn = last_rn + 1; + if (new_incr->rn > sclp.rnmax) { + kfree(new_incr); + return; + } + list_add(&new_incr->list, prev); +} + +static int __init sclp_detect_standby_memory(void) +{ + struct read_storage_sccb *sccb; + int i, id, assigned, rc; + + /* No standby memory in kdump mode */ + if (oldmem_data.start) + return 0; + if ((sclp.facilities & 0xe00000000000UL) != 0xe00000000000UL) + return 0; + rc = -ENOMEM; + sccb = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!sccb) + goto out; + assigned = 0; + for (id = 0; id <= sclp_max_storage_id; id++) { + memset(sccb, 0, PAGE_SIZE); + sccb->header.length = PAGE_SIZE; + rc = sclp_sync_request(SCLP_CMDW_READ_STORAGE_INFO | id << 8, sccb); + if (rc) + goto out; + switch (sccb->header.response_code) { + case 0x0010: + set_bit(id, sclp_storage_ids); + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + insert_increment(sccb->entries[i] >> 16, 0, 1); + } + break; + case 0x0310: + break; + case 0x0410: + for (i = 0; i < sccb->assigned; i++) { + if (!sccb->entries[i]) + continue; + assigned++; + 
insert_increment(sccb->entries[i] >> 16, 1, 1); + } + break; + default: + rc = -EIO; + break; + } + if (!rc) + sclp_max_storage_id = sccb->max_id; + } + if (rc || list_empty(&sclp_mem_list)) + goto out; + for (i = 1; i <= sclp.rnmax - assigned; i++) + insert_increment(0, 1, 0); + rc = register_memory_notifier(&sclp_mem_nb); + if (rc) + goto out; + sclp_add_standby_memory(); +out: + free_page((unsigned long)sccb); + return rc; +} +__initcall(sclp_detect_standby_memory); From de88e74889a30bd9ff4047726021cde857348b4b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 4 Aug 2025 17:08:54 +0200 Subject: [PATCH 03/21] s390/bitops: Slightly optimize ffs() and fls64() Use a simpler algorithm to calculate the result of ffs() and fls64(). This generates slightly better code and increases readability. Kernel image size is reduced by ~3kb (gcc 15.1.0 + defconfig). Suggested-by: Nina Schoetterl-Glausch Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index a5ca0a947691..10d7573d1582 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -193,10 +193,9 @@ static inline unsigned long __ffs(unsigned long word) */ static inline int ffs(int word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; unsigned int val = (unsigned int)word; - return (1 + (__flogr(-val & val) ^ (BITS_PER_LONG - 1))) & mask; + return BITS_PER_LONG - __flogr(-val & val); } /** @@ -223,9 +222,7 @@ static inline unsigned long __fls(unsigned long word) */ static inline int fls64(unsigned long word) { - unsigned long mask = 2 * BITS_PER_LONG - 1; - - return (1 + (__flogr(word) ^ (BITS_PER_LONG - 1))) & mask; + return BITS_PER_LONG - __flogr(word); } /** From 669bc57e7016cf9d1a9eedb2a984c4fb4fd67f3d Mon Sep 17 00:00:00 2001 From: Juergen Christ Date: Mon, 11 Aug 2025 17:22:53 +0200 Subject: [PATCH 
04/21] s390/bitops: Optimize inlining GCC inlining heuristics prevent code growth due to inlining into cold paths. This causes GCC to emit a partially specialized version of __flogr for non-constant input for all occurrences on cold paths. This happens since the overhead seen during inlining includes setting up a union register_pair, calling flogr, and extracting and casting the result. This overhead is not removed until the function is lowered into RTL. But this happens after inlining. For -ftrivial-auto-var-init=zero builds, an additional initialization of the union register_pair adds another statement to be inlined. This is unneeded since the even register is initialized anyway and the odd register is not an input register. It is only marked as such since the whole pair has to be marked as a read/write output register. Mark the union register_pair as uninitialized to get rid of this statement. This, however, does not change the code since the initialization happens when part of the register pair is written. Nevertheless, GCC function size approximation during inlining is reduced by one statement. Force inlining of flogr and also flatten some other functions that should be leaf functions but are called in cold context, like, e.g., __init functions. Acked-by: Heiko Carstens Signed-off-by: Juergen Christ Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 10d7573d1582..9dfb687ba620 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -130,7 +130,7 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64.
*/ -static inline unsigned char __flogr(unsigned long word) +static __always_inline unsigned char __flogr(unsigned long word) { if (__builtin_constant_p(word)) { unsigned long bit = 0; @@ -163,7 +163,7 @@ static inline unsigned char __flogr(unsigned long word) } return bit; } else { - union register_pair rp; + union register_pair rp __uninitialized; rp.even = word; asm volatile( @@ -179,7 +179,7 @@ static inline unsigned char __flogr(unsigned long word) * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline __flatten unsigned long __ffs(unsigned long word) { return __flogr(-word & word) ^ (BITS_PER_LONG - 1); } @@ -191,7 +191,7 @@ static inline unsigned long __ffs(unsigned long word) * This is defined the same way as the libc and * compiler builtin ffs routines (man ffs). */ -static inline int ffs(int word) +static __always_inline __flatten int ffs(int word) { unsigned int val = (unsigned int)word; @@ -204,7 +204,7 @@ static inline int ffs(int word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline __flatten unsigned long __fls(unsigned long word) { return __flogr(word) ^ (BITS_PER_LONG - 1); } @@ -220,7 +220,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ -static inline int fls64(unsigned long word) +static __always_inline __flatten int fls64(unsigned long word) { return BITS_PER_LONG - __flogr(word); } @@ -232,7 +232,7 @@ static inline int fls64(unsigned long word) * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
*/ -static inline int fls(unsigned int word) +static __always_inline __flatten int fls(unsigned int word) { return fls64(word); } From b3597eb51aad4a6e985c701c129bd7fc2cf0d682 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Aug 2025 16:58:43 +0200 Subject: [PATCH 05/21] s390/boot: Add common boot_panic() code Introduce a common boot_panic() helper macro, and use it to get rid of three more or less identical implementations. Signed-off-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/boot/boot.h | 8 ++++++++ arch/s390/boot/decompressor.c | 4 +--- arch/s390/boot/physmem_info.c | 4 +--- arch/s390/boot/startup.c | 13 +++---------- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index c0152db285f0..37d5b097ede5 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -10,6 +10,7 @@ #include #include +#include struct vmlinux_info { unsigned long entry; @@ -89,6 +90,13 @@ void __noreturn jump_to_kernel(psw_t *psw); #define boot_info(fmt, ...) boot_printk(KERN_INFO boot_fmt(fmt), ##__VA_ARGS__) #define boot_debug(fmt, ...) boot_printk(KERN_DEBUG boot_fmt(fmt), ##__VA_ARGS__) +#define boot_panic(...) 
do { \ + boot_emerg(__VA_ARGS__); \ + print_stacktrace(current_frame_address()); \ + boot_emerg(" -- System halted\n"); \ + disabled_wait(); \ +} while (0) + extern struct machine_info machine; extern int boot_console_loglevel; extern bool boot_ignore_loglevel; diff --git a/arch/s390/boot/decompressor.c b/arch/s390/boot/decompressor.c index 03500b9d9fb9..8d1bc25a6bf4 100644 --- a/arch/s390/boot/decompressor.c +++ b/arch/s390/boot/decompressor.c @@ -68,9 +68,7 @@ static void decompress_error(char *m) { if (bootdebug) boot_rb_dump(); - boot_emerg("Decompression error: %s\n", m); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Decompression error: %s\n", m); } unsigned long mem_safe_offset(void) diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c index 45e3d057cfaa..1f2ca5435838 100644 --- a/arch/s390/boot/physmem_info.c +++ b/arch/s390/boot/physmem_info.c @@ -228,9 +228,7 @@ static void die_oom(unsigned long size, unsigned long align, unsigned long min, boot_emerg("Usable online memory total: %lu Reserved: %lu Free: %lu\n", total_mem, total_reserved_mem, total_mem > total_reserved_mem ? 
total_mem - total_reserved_mem : 0); - print_stacktrace(current_frame_address()); - boot_emerg(" -- System halted\n"); - disabled_wait(); + boot_panic("Oom\n"); } static void _physmem_reserve(enum reserved_range_type type, unsigned long addr, unsigned long size) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 93684a775716..3fbd25b9498f 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -44,13 +44,6 @@ u64 __bootdata_preserved(clock_comparator_max) = -1UL; u64 __bootdata_preserved(stfle_fac_list[16]); struct oldmem_data __bootdata_preserved(oldmem_data); -void error(char *x) -{ - boot_emerg("%s\n", x); - boot_emerg(" -- System halted\n"); - disabled_wait(); -} - static char sysinfo_page[PAGE_SIZE] __aligned(PAGE_SIZE); static void detect_machine_type(void) @@ -220,10 +213,10 @@ static void rescue_initrd(unsigned long min, unsigned long max) static void copy_bootdata(void) { if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) - error(".boot.data section size mismatch"); + boot_panic(".boot.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) - error(".boot.preserved.data section size mismatch"); + boot_panic(".boot.preserved.data section size mismatch\n"); memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } @@ -237,7 +230,7 @@ static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, for (reloc = (int *)__vmlinux_relocs_64_start; reloc < (int *)__vmlinux_relocs_64_end; reloc++) { loc = (long)*reloc + phys_offset; if (loc < min_addr || loc > max_addr) - error("64-bit relocation outside of kernel!\n"); + boot_panic("64-bit relocation outside of kernel!\n"); *(u64 *)loc += offset; } } From 11aa54ba4cfa5390ea47c9a1fc62502abce1f6b9 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger 
Date: Wed, 13 Aug 2025 11:43:50 +0200 Subject: [PATCH 06/21] s390/pkey: Forward keygenflags to ep11_unwrapkey The pkey ioctl PKEY_CLR2SECK2 describes in the pkey.h header file the parameter 'keygenflags' which is forwarded to the handler functions which actually deal with the clear key to secure key operation. The ep11 handler module function ep11_clr2keyblob() receives this parameter but does not forward it to the underlying function ep11_unwrapkey() on invocation. So in the end the user of this ioctl could not forward additional key generation flags to the ep11 implementation and thus was unable to modify the key generation process in any way. So now call ep11_unwrapkey() with the real keygenflags instead of 0 and thus the user of this ioctl can for example via keygenflags provide valid combinations of XCP_BLOB_* flags. Suggested-by: Ingo Franzki Signed-off-by: Harald Freudenberger Reviewed-by: Ingo Franzki Signed-off-by: Alexander Gordeev --- drivers/s390/crypto/zcrypt_ep11misc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index 3bf09a89a089..e92e2fd8ce5d 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -1405,7 +1405,9 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, /* Step 3: import the encrypted key value as a new key */ rc = ep11_unwrapkey(card, domain, kek, keklen, encbuf, encbuflen, 0, def_iv, - keybitsize, 0, keybuf, keybufsize, keytype, xflags); + keybitsize, keygenflags, + keybuf, keybufsize, + keytype, xflags); if (rc) { ZCRYPT_DBF_ERR("%s importing key value as new key failed, rc=%d\n", __func__, rc); From f5507aefc9114ced49d1ee527f63ea12ff5d7751 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 4 Sep 2025 13:40:29 +0200 Subject: [PATCH 07/21] s390/debug: Replace kmalloc() + copy_from_user() with memdup_user_nul() Replace kmalloc() followed by
copy_from_user() with memdup_user_nul() to improve and simplify debug_get_user_string(). Remove the manual NUL-termination. No functional changes intended. Signed-off-by: Thorsten Blum Reviewed-by: Niklas Schnelle Signed-off-by: Alexander Gordeev --- arch/s390/kernel/debug.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c62100dc62c8..6a26f202441d 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1416,18 +1416,12 @@ static inline char *debug_get_user_string(const char __user *user_buf, { char *buffer; - buffer = kmalloc(user_len + 1, GFP_KERNEL); - if (!buffer) - return ERR_PTR(-ENOMEM); - if (copy_from_user(buffer, user_buf, user_len) != 0) { - kfree(buffer); - return ERR_PTR(-EFAULT); - } + buffer = memdup_user_nul(user_buf, user_len); + if (IS_ERR(buffer)) + return buffer; /* got the string, now strip linefeed. */ if (buffer[user_len - 1] == '\n') buffer[user_len - 1] = 0; - else - buffer[user_len] = 0; return buffer; } From 5450abb0dea4f9fb432dea2ca92ea7a9bd25650b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 5 Sep 2025 13:02:23 +0200 Subject: [PATCH 08/21] s390/hmcdrv: Replace kmalloc() + copy_from_user() with memdup_user_nul() Replace kmalloc() followed by copy_from_user() with memdup_user_nul() to improve and simplify hmcdrv_dev_write(). Remove the manual NUL-termination. No functional changes intended. 
Signed-off-by: Thorsten Blum Signed-off-by: Alexander Gordeev --- drivers/s390/char/hmcdrv_dev.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index e069dd685899..b26fcf6849f2 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -244,24 +244,17 @@ static ssize_t hmcdrv_dev_write(struct file *fp, const char __user *ubuf, size_t len, loff_t *pos) { ssize_t retlen; + void *pdata; pr_debug("writing file '/dev/%pD' at pos. %lld with length %zd\n", fp, (long long) *pos, len); if (!fp->private_data) { /* first expect a cmd write */ - fp->private_data = kmalloc(len + 1, GFP_KERNEL); - - if (!fp->private_data) - return -ENOMEM; - - if (!copy_from_user(fp->private_data, ubuf, len)) { - ((char *)fp->private_data)[len] = '\0'; - return len; - } - - kfree(fp->private_data); - fp->private_data = NULL; - return -EFAULT; + pdata = memdup_user_nul(ubuf, len); + if (IS_ERR(pdata)) + return PTR_ERR(pdata); + fp->private_data = pdata; + return len; } retlen = hmcdrv_dev_transfer((char *) fp->private_data, From 5b27dfb1d7b59db9e72766c990a3ee80e39e4f69 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 9 Sep 2025 15:46:26 +0200 Subject: [PATCH 09/21] s390/dcssblk: Add DAX support With ZONE_DEVICE now available for s390, struct pages can be allocated for proper DAX support in dcssblk driver via devm_memremap_pages(). Adding struct pages for a range requires that the range is aligned to SUBSECTION_SIZE, which is defined as 2 MB in common code. Therefore, only enable DAX support and allocate struct pages for DCSS ranges that are aligned to 2 MB. 
Signed-off-by: Gerald Schaefer Acked-by: Heiko Carstens Reviewed-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- drivers/s390/block/Kconfig | 12 ++---------- drivers/s390/block/dcssblk.c | 35 +++++++++++++++++++++++++++++------ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig index 8c1c908d2c6e..877a9bc7f04b 100644 --- a/drivers/s390/block/Kconfig +++ b/drivers/s390/block/Kconfig @@ -5,19 +5,11 @@ comment "S/390 block device drivers" config DCSSBLK def_tristate m prompt "DCSSBLK support" - depends on S390 && BLOCK && (DAX || DAX=n) + depends on S390 && BLOCK && ZONE_DEVICE + select FS_DAX help Support for dcss block device -config DCSSBLK_DAX - def_bool y - depends on DCSSBLK - # requires S390 ZONE_DEVICE support - depends on BROKEN - prompt "DCSSBLK DAX support" - help - Enable DAX operation for the dcss block device - config DASD def_tristate y prompt "Support for DASD devices" diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 94fa5edecadd..86fef4b15015 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -79,6 +79,8 @@ struct dcssblk_dev_info { int num_of_segments; struct list_head seg_list; struct dax_device *dax_dev; + struct dev_pagemap pgmap; + void *pgmap_addr; }; struct segment_info { @@ -415,6 +417,8 @@ removeseg: dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); del_gendisk(dev_info->gd); put_disk(dev_info->gd); @@ -537,9 +541,6 @@ static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info) { struct dax_device *dax_dev; - if (!IS_ENABLED(CONFIG_DCSSBLK_DAX)) - return 0; - dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops); if (IS_ERR(dax_dev)) return PTR_ERR(dax_dev); @@ -562,6 +563,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char struct dcssblk_dev_info 
*dev_info; struct segment_info *seg_info, *temp; char *local_buf; + void *addr; unsigned long seg_byte_size; dev_info = NULL; @@ -687,9 +689,26 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char if (rc) goto put_dev; - rc = dcssblk_setup_dax(dev_info); - if (rc) - goto out_dax; + if (!IS_ALIGNED(dev_info->start, SUBSECTION_SIZE) || + !IS_ALIGNED(dev_info->end + 1, SUBSECTION_SIZE)) { + pr_info("DCSS %s is not aligned to %lu bytes, DAX support disabled\n", + local_buf, SUBSECTION_SIZE); + } else { + dev_info->pgmap.type = MEMORY_DEVICE_FS_DAX; + dev_info->pgmap.range.start = dev_info->start; + dev_info->pgmap.range.end = dev_info->end; + dev_info->pgmap.nr_range = 1; + addr = devm_memremap_pages(&dev_info->dev, &dev_info->pgmap); + if (IS_ERR(addr)) { + rc = PTR_ERR(addr); + goto put_dev; + } + dev_info->pgmap_addr = addr; + rc = dcssblk_setup_dax(dev_info); + if (rc) + goto out_dax; + pr_info("DAX support enabled for DCSS %s\n", local_buf); + } get_device(&dev_info->dev); rc = device_add_disk(&dev_info->dev, dev_info->gd, NULL); @@ -716,6 +735,8 @@ out_dax_host: out_dax: kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); put_dev: list_del(&dev_info->lh); put_disk(dev_info->gd); @@ -801,6 +822,8 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch dax_remove_host(dev_info->gd); kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); + if (dev_info->pgmap_addr) + devm_memunmap_pages(&dev_info->dev, &dev_info->pgmap); del_gendisk(dev_info->gd); put_disk(dev_info->gd); From a9f859b516ac98c06b0d24e691fceab32a9665d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:00 +0200 Subject: [PATCH 10/21] s390/bitops: Limit return value range of __flogr() With the recent ffs() and fls64() optimization a logical AND operation was removed, which allowed the compiler to tell the return value range of both
functions. This may lead to compile warnings as reported by the kernel test robot: drivers/infiniband/hw/mlx5/mr.c: In function 'mlx5r_cache_create_ent_locked': >> drivers/infiniband/hw/mlx5/mr.c:840:31: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=] 840 | sprintf(ent->name, "%d", order); | ^ In function 'mlx5_mkey_cache_debugfs_add_ent', inlined from 'mlx5r_cache_create_ent_locked' at drivers/infiniband/hw/mlx5/mr.c:930:3: drivers/infiniband/hw/mlx5/mr.c:840:9: note: 'sprintf' output between 2 and 5 bytes into a destination of size 4 840 | sprintf(ent->name, "%d", order); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Add the AND operation again to address the warning. From a correctness point of view the AND operation is not necessary, however there is no other way to tell the compiler that the returned value of the flogr inline assembly is in the range of 0..64. This increases the kernel image size by 566 bytes (defconfig, gcc 15.2.0). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508211859.UoYsJbLN-lkp@intel.com/ Fixes: de88e74889a3 ("s390/bitops: Slightly optimize ffs() and fls64()") Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 9dfb687ba620..9bc70acbac9e 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -169,7 +169,7 @@ static __always_inline unsigned char __flogr(unsigned long word) asm volatile( " flogr %[rp],%[rp]\n" : [rp] "+d" (rp.pair) : : "cc"); - return rp.even; + return rp.even & 127; } } From f72e2cff13aefe305fc8fc6afe4f43626e4ad88c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:01 +0200 Subject: [PATCH 11/21] compiler_types: Add __assume macro Make the statement attribute 
"assume" with a new __assume macro available. The assume attribute is used to indicate that a certain condition is assumed to be true. Compilers may or may not use this indication to generate optimized code. If this condition is violated at runtime, the behavior is undefined. Note that the clang documentation states that optimizers may react differently to this attribute, and this may even have a negative performance impact. Therefore this attribute should be used with care. Signed-off-by: Heiko Carstens Reviewed-by: Nathan Chancellor Signed-off-by: Alexander Gordeev --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ init/Kconfig | 10 ++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..2f3e80bf9f35 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -329,6 +329,29 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. 
+ * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 diff --git a/init/Kconfig b/init/Kconfig index 836320251219..dabec90f18f6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -112,6 +112,16 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASSUME + bool + # clang needs to be at least 19.1.0 since the meaning of the assume + # attribute changed: + # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17 + default y if CC_IS_CLANG && CLANG_VERSION >= 190100 + # supported since gcc 13.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654 + default y if CC_IS_GCC && GCC_VERSION >= 130100 + config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) From 79161603952c842eb22313f2060051b359b0a592 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:02 +0200 Subject: [PATCH 12/21] s390/bitops: Use __assume() for __flogr() inline assembly return value Use __assume() to tell compilers that the output operand of the __flogr() inline assembly contains a value in the range of 0..64. This allows to optimize the logical AND operation away. This reduces the kernel image size by 2804 bytes (defconfig, gcc 15.2.0). 
Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 9bc70acbac9e..ac94672db817 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -132,9 +132,10 @@ static inline bool test_bit_inv(unsigned long nr, */ static __always_inline unsigned char __flogr(unsigned long word) { - if (__builtin_constant_p(word)) { - unsigned long bit = 0; + unsigned long bit; + if (__builtin_constant_p(word)) { + bit = 0; if (!word) return 64; if (!(word & 0xffffffff00000000UL)) { @@ -169,7 +170,14 @@ static __always_inline unsigned char __flogr(unsigned long word) asm volatile( " flogr %[rp],%[rp]\n" : [rp] "+d" (rp.pair) : : "cc"); - return rp.even & 127; + bit = rp.even; + /* + * The result of the flogr instruction is a value in the range + * of 0..64. Let the compiler know that the AND operation can + * be optimized away. + */ + __assume(bit <= 64); + return bit & 127; } } From f46ccdb87a2573a23ee2d2c21a6b087af9ae76c0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:03 +0200 Subject: [PATCH 13/21] s390/bitops: Cleanup __flogr() The flogr() inline assembly has no side effects and generates the same output if the input does not change. Therefore remove the volatile qualifier to allow the compiler to optimize the inline assembly away, if possible. Also remove the superfluous '\n' which makes the inline assembly appear larger than it is according to compiler heuristics (number of lines). Furthermore change the return type of flogr() to unsigned long and add the const attribute to the function. This reduces the kernel image size by 994 bytes (defconfig, gcc 15.2.0). 
Suggested-by: Juergen Christ Reviewed-by: Juergen Christ Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index ac94672db817..e643f24ebc05 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -130,7 +130,7 @@ static inline bool test_bit_inv(unsigned long nr, * where the most significant bit has bit number 0. * If no bit is set this function returns 64. */ -static __always_inline unsigned char __flogr(unsigned long word) +static __always_inline __attribute_const__ unsigned long __flogr(unsigned long word) { unsigned long bit; @@ -167,9 +167,8 @@ static __always_inline unsigned char __flogr(unsigned long word) union register_pair rp __uninitialized; rp.even = word; - asm volatile( - " flogr %[rp],%[rp]\n" - : [rp] "+d" (rp.pair) : : "cc"); + asm("flogr %[rp],%[rp]" + : [rp] "+d" (rp.pair) : : "cc"); bit = rp.even; /* * The result of the flogr instruction is a value in the range From e11727b2b0ca23d147c4d42f494a59aba0749c89 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 18 Sep 2025 16:53:41 +0200 Subject: [PATCH 14/21] s390/configs: Enable additional network features Enable AF_XDP, kTLS, and Mellanox subfunctions to accelerate network packet processing. 
Signed-off-by: Hendrik Brueckner Signed-off-by: Alexander Gordeev --- arch/s390/configs/debug_defconfig | 6 ++++++ arch/s390/configs/defconfig | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6b33429f1c4d..e68c56ff6fd8 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -114,8 +114,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -547,6 +552,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index b75eb2775850..d6be7372f41e 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -105,8 +105,13 @@ CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_TLS_TOE=y CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y @@ -537,6 +542,7 @@ CONFIG_NLMON=m CONFIG_MLX4_EN=m CONFIG_MLX5_CORE=m CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_SF=y # CONFIG_NET_VENDOR_META is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROCHIP is not set From 5671ce2a1fc6b4a16cff962423bc416b92cac3c8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 17:24:05 +0200 Subject: [PATCH 15/21] s390/mm: Use __GFP_ACCOUNT for user page table allocations Add missing kmemcg accounting of user page table allocations. 
Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/mm/pgalloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fc..ad3e0f7f7fc1 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -16,9 +16,13 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + gfp_t gfp = GFP_KERNEL_ACCOUNT; + struct ptdesc *ptdesc; unsigned long *table; + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; table = ptdesc_to_virt(ptdesc); @@ -117,7 +121,7 @@ struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) struct ptdesc *ptdesc; u64 *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); @@ -136,10 +140,13 @@ void page_table_free_pgste(struct ptdesc *ptdesc) unsigned long *page_table_alloc(struct mm_struct *mm) { + gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; unsigned long *table; - ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) { From 7b80a23c0e33ae5a3ae68e0cf5b5a59e8a368c37 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 13:40:19 +0200 Subject: [PATCH 16/21] s390/bitops: Switch to generic fls(), fls64(), etc. Switch to generic fls(), fls64(), etc. which are implemented with __builtin_ctzl(), __builtin_clzl(). Those builtins are available for all supported compilers. Kernel image size is reduced by ~10kb (gcc 15.1.0 + defconfig). 
Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/bitops.h | 56 +++------------------------------- 1 file changed, 5 insertions(+), 51 deletions(-) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index e643f24ebc05..8b9060c26c52 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -180,17 +180,6 @@ static __always_inline __attribute_const__ unsigned long __flogr(unsigned long w } } -/** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - */ -static __always_inline __flatten unsigned long __ffs(unsigned long word) -{ - return __flogr(-word & word) ^ (BITS_PER_LONG - 1); -} - /** * ffs - find first bit set * @word: the word to search @@ -205,48 +194,13 @@ static __always_inline __flatten int ffs(int word) return BITS_PER_LONG - __flogr(-val & val); } -/** - * __fls - find last (most-significant) set bit in a long word - * @word: the word to search - * - * Undefined if no set bit exists, so code should check against 0 first. - */ -static __always_inline __flatten unsigned long __fls(unsigned long word) -{ - return __flogr(word) ^ (BITS_PER_LONG - 1); -} - -/** - * fls64 - find last set bit in a 64-bit word - * @word: the word to search - * - * This is defined in a similar way as the libc and compiler builtin - * ffsll, but returns the position of the most significant set bit. - * - * fls64(value) returns 0 if value is 0 or the position of the last - * set bit if value is nonzero. The last (most significant) bit is - * at position 64. - */ -static __always_inline __flatten int fls64(unsigned long word) -{ - return BITS_PER_LONG - __flogr(word); -} - -/** - * fls - find last (most-significant) bit set - * @word: the word to search - * - * This is defined the same way as ffs. - * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
- */ -static __always_inline __flatten int fls(unsigned int word) -{ - return fls64(word); -} - +#include +#include +#include +#include +#include #include #include -#include #include #include #include From 6c4e0cb3d87ad63a30e05e7624a45a6f01240e70 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 22 Sep 2025 13:40:20 +0200 Subject: [PATCH 17/21] s390/bitops: Switch to generic ffs() if supported by compiler Use generic ffs() / __builtin_ffs() if supported by the compiler. GCC 16 will have support for __builtin_ffs(). See gcc commit f50cff9766c5 ("s390: Implement clz and ctz for SI mode"). In the distant future when GCC 16 becomes the minimum supported version, this allows to get rid of the flogr inline assembly. Kernel image size is reduced by ~500 bytes (gcc 16 beta + defconfig). Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/Kconfig | 7 +++++++ arch/s390/include/asm/bitops.h | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33c..22862ce7ec68 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -49,6 +49,13 @@ config KASAN_SHADOW_OFFSET depends on KASAN default 0x1C000000000000 +config CC_HAS_BUILTIN_FFS + def_bool !(CC_IS_GCC && GCC_VERSION < 160000) + help + GCC versions before 16.0.0 generate library calls to ffs() + for __builtin_ffs() even when __has_builtin(__builtin_ffs) + is true. 
+ config CC_ASM_FLAG_OUTPUT_BROKEN def_bool CC_IS_GCC && GCC_VERSION < 140200 help diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 8b9060c26c52..1564dd3a5a82 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -122,6 +122,8 @@ static inline bool test_bit_inv(unsigned long nr, return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); } +#ifndef CONFIG_CC_HAS_BUILTIN_FFS + /** * __flogr - find leftmost one * @word - The word to search @@ -194,6 +196,12 @@ static __always_inline __flatten int ffs(int word) return BITS_PER_LONG - __flogr(-val & val); } +#else /* CONFIG_CC_HAS_BUILTIN_FFS */ + +#include + +#endif /* CONFIG_CC_HAS_BUILTIN_FFS */ + #include #include #include From f707d2f7a0c7793406daf0e223bad01bb748343e Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Wed, 17 Sep 2025 17:38:57 +0200 Subject: [PATCH 18/21] s390/tape: Add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. 
With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Acked-by: Alexander Gordeev Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- drivers/s390/char/tape_3590.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c index a1bafaf73f87..2a2931d303cb 100644 --- a/drivers/s390/char/tape_3590.c +++ b/drivers/s390/char/tape_3590.c @@ -1671,7 +1671,7 @@ tape_3590_init(void) DBF_EVENT(3, "3590 init\n"); - tape_3590_wq = alloc_workqueue("tape_3590", 0, 0); + tape_3590_wq = alloc_workqueue("tape_3590", WQ_PERCPU, 0); if (!tape_3590_wq) return -ENOMEM; From dbfe205a344a865b9c36706738f45bc554a040c7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Wed, 17 Sep 2025 17:38:58 +0200 Subject: [PATCH 19/21] s390/diag324: Replace use of system_wq with system_percpu_wq Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue, yet nothing in its name tells about that CPU affinity constraint, which is very often not required by users. Make it clear by renaming system_wq to system_percpu_wq. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new per-cpu wq. The old wq will be kept for a few release cycles. 
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Acked-by: Heiko Carstens Signed-off-by: Alexander Gordeev --- arch/s390/kernel/diag/diag324.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/diag/diag324.c b/arch/s390/kernel/diag/diag324.c index 7fa4c0b7eb6c..f0a8b4841fb9 100644 --- a/arch/s390/kernel/diag/diag324.c +++ b/arch/s390/kernel/diag/diag324.c @@ -116,7 +116,7 @@ static void pibwork_handler(struct work_struct *work) mutex_lock(&pibmutex); timedout = ktime_add_ns(data->expire, PIBWORK_DELAY); if (ktime_before(ktime_get(), timedout)) { - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); goto out; } vfree(data->pib); @@ -174,7 +174,7 @@ long diag324_pibbuf(unsigned long arg) pib_update(data); data->sequence++; data->expire = ktime_add_ns(ktime_get(), tod_to_ns(data->pib->intv)); - mod_delayed_work(system_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); + mod_delayed_work(system_percpu_wq, &pibwork, nsecs_to_jiffies(PIBWORK_DELAY)); first = false; } rc = data->rc; From 72105fc1c1cb67e779fe2da9d22ffae189c00cfc Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Wed, 17 Sep 2025 17:38:59 +0200 Subject: [PATCH 20/21] s390: Replace use of system_wq with system_dfl_wq Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_wq is a per-CPU workqueue (replaced by system_percpu_wq), but the current code does not benefit from it. Because of that, system_wq has been replaced by system_dfl_wq, the new unbound workqueue. The old wq will be kept for a few release cycles. 
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Acked-by: Heiko Carstens Reviewed-by: Mete Durlu Signed-off-by: Alexander Gordeev --- arch/s390/kernel/hiperdispatch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c index e7b66d046e8d..2507bc3f7757 100644 --- a/arch/s390/kernel/hiperdispatch.c +++ b/arch/s390/kernel/hiperdispatch.c @@ -191,7 +191,7 @@ int hd_enable_hiperdispatch(void) return 0; if (hd_online_cores <= hd_entitled_cores) return 0; - mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + mod_delayed_work(system_dfl_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); hd_update_capacities(); return 1; } From 088bb10e37252034ec58a6152f20bfdc8a837f54 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Sep 2025 17:34:30 +0200 Subject: [PATCH 21/21] s390/mm: Add memory allocation profiling hooks Similar to common code changes [1] add alloc_hook() wrappers to page table allocation functions to allow for memory allocation profiling. If CONFIG_MEM_ALLOC_PROFILING is enabled call sites of page table allocations are accounted, instead of e.g. only crst_table_alloc() and page_table_alloc(). This allows for slightly better profiling data, and the output of /proc/allocinfo is similar to other architectures. Without alloc_hook() wrappers the output of /proc/allocinfo looks like this: 17096704 4174 mm/memory.c:1061 func:folio_prealloc 17809408 4348 mm/memory.c:1063 func:folio_prealloc 0 0 mm/memory.c:4422 func:alloc_swap_folio 0 0 mm/memory.c:4286 func:__alloc_swap_folio 0 0 mm/memory.c:4971 func:alloc_anon_folio ... 
1589248 97 arch/s390/mm/pgalloc.c:25 func:crst_table_alloc 0 0 arch/s390/mm/pgalloc.c:124 func:page_table_alloc_pgste 4280320 1045 arch/s390/mm/pgalloc.c:149 func:page_table_alloc With alloc_hook() wrappers: 1097728 268 mm/memory.c:5147 func:__do_fault 20119552 4912 mm/memory.c:1061 func:folio_prealloc 17534976 4281 mm/memory.c:1063 func:folio_prealloc 0 0 mm/memory.c:4422 func:alloc_swap_folio 0 0 mm/memory.c:4286 func:__alloc_swap_folio 786432 192 mm/memory.c:452 func:__pte_alloc 405504 99 mm/memory.c:464 func:__pte_alloc_kernel 1880064 459 mm/memory.c:5525 func:do_fault_around 0 0 mm/memory.c:6403 func:__p4d_alloc 0 0 mm/memory.c:6426 func:__pud_alloc 1064960 65 mm/memory.c:6450 func:__pmd_alloc 0 0 mm/memory.c:4971 func:alloc_anon_folio 0 0 mm/memory.c:5233 func:do_set_pmd [1] commit 2c321f3f70bc ("mm: change inlined allocation helpers to account at the call site") Signed-off-by: Heiko Carstens Acked-by: Alexander Gordeev Signed-off-by: Alexander Gordeev --- arch/s390/include/asm/pgalloc.h | 30 +++++++++++++++++++----------- arch/s390/mm/pgalloc.c | 12 ++++++------ 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 5345398df653..a16e65072371 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -19,12 +19,16 @@ #define CRST_ALLOC_ORDER 2 -unsigned long *crst_table_alloc(struct mm_struct *); +unsigned long *crst_table_alloc_noprof(struct mm_struct *); +#define crst_table_alloc(...) alloc_hooks(crst_table_alloc_noprof(__VA_ARGS__)) void crst_table_free(struct mm_struct *, unsigned long *); -unsigned long *page_table_alloc(struct mm_struct *); -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm); +unsigned long *page_table_alloc_noprof(struct mm_struct *); +#define page_table_alloc(...) 
alloc_hooks(page_table_alloc_noprof(__VA_ARGS__)) void page_table_free(struct mm_struct *, unsigned long *); + +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm); +#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__)) void page_table_free_pgste(struct ptdesc *ptdesc); static inline void crst_table_init(unsigned long *crst, unsigned long entry) @@ -48,9 +52,9 @@ static inline unsigned long check_asce_limit(struct mm_struct *mm, unsigned long return addr; } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -59,6 +63,7 @@ static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long address) return (p4d_t *) table; } +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { @@ -69,9 +74,9 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) crst_table_free(mm, (unsigned long *) p4d); } -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long address) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -80,6 +85,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) return (pud_t *) table; } +#define pud_alloc_one(...) 
alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__)) static inline void pud_free(struct mm_struct *mm, pud_t *pud) { @@ -90,9 +96,9 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) crst_table_free(mm, (unsigned long *) pud); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long vmaddr) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -103,6 +109,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) } return (pmd_t *) table; } +#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { @@ -127,9 +134,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud(_REGION3_ENTRY | __pa(pmd))); } -static inline pgd_t *pgd_alloc(struct mm_struct *mm) +static inline pgd_t *pgd_alloc_noprof(struct mm_struct *mm) { - unsigned long *table = crst_table_alloc(mm); + unsigned long *table = crst_table_alloc_noprof(mm); if (!table) return NULL; @@ -137,6 +144,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) return (pgd_t *) table; } +#define pgd_alloc(...) 
alloc_hooks(pgd_alloc_noprof(__VA_ARGS__)) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index ad3e0f7f7fc1..36700384fe6b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -14,7 +14,7 @@ #include #include -unsigned long *crst_table_alloc(struct mm_struct *mm) +unsigned long *crst_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; @@ -22,7 +22,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - ptdesc = pagetable_alloc(gfp, CRST_ALLOC_ORDER); + ptdesc = pagetable_alloc_noprof(gfp, CRST_ALLOC_ORDER); if (!ptdesc) return NULL; table = ptdesc_to_virt(ptdesc); @@ -116,12 +116,12 @@ err_p4d: #ifdef CONFIG_PGSTE -struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm) { struct ptdesc *ptdesc; u64 *table; - ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); + ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0); if (ptdesc) { table = (u64 *)ptdesc_to_virt(ptdesc); __arch_set_page_dat(table, 1); @@ -138,7 +138,7 @@ void page_table_free_pgste(struct ptdesc *ptdesc) #endif /* CONFIG_PGSTE */ -unsigned long *page_table_alloc(struct mm_struct *mm) +unsigned long *page_table_alloc_noprof(struct mm_struct *mm) { gfp_t gfp = GFP_KERNEL_ACCOUNT; struct ptdesc *ptdesc; @@ -146,7 +146,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - ptdesc = pagetable_alloc(gfp, 0); + ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) return NULL; if (!pagetable_pte_ctor(mm, ptdesc)) {