Commit 4b5bc2ec authored by David Woodhouse's avatar David Woodhouse Committed by Ingo Molnar
Browse files

x86/kexec: Allocate PGD for x86_64 transition page tables separately



Now that the following fix:

  d0ceea66 ("x86/mm: Add _PAGE_NOPTISHADOW bit to avoid updating userspace page tables")

stops kernel_ident_mapping_init() from scribbling over the end of a
4KiB PGD by assuming the following 4KiB will be a userspace PGD,
there's no good reason for the kexec PGD to be part of a single
8KiB allocation with the control_code_page.

( It's not clear that that was the reason for x86_64 kexec doing it that
  way in the first place either; there were no comments to that effect and
  it seems to have been the case even before PTI came along. It looks like
  it was just a happy accident which prevented memory corruption on kexec. )

Either way, it definitely isn't needed now. Just allocate the PGD
separately on x86_64, like i386 already does.

Signed-off-by: default avatarDavid Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: https://lore.kernel.org/r/20241205153343.3275139-6-dwmw2@infradead.org
parent 9e5683e2
Loading
Loading
Loading
Loading
+14 −4
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
# define PAGES_NR		4
#endif

# define KEXEC_CONTROL_PAGE_SIZE	4096
# define KEXEC_CONTROL_CODE_MAX_SIZE	2048

#ifndef __ASSEMBLY__
@@ -43,7 +44,6 @@ struct kimage;
/* Maximum address we can use for the control code buffer */
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE

# define KEXEC_CONTROL_PAGE_SIZE	4096

/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_386
@@ -58,9 +58,6 @@ struct kimage;
/* Maximum address we can use for the control pages */
# define KEXEC_CONTROL_MEMORY_LIMIT     (MAXMEM-1)

/* Allocate one page for the pdp and the second for the code */
# define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)

/* The native architecture */
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
@@ -145,6 +142,19 @@ struct kimage_arch {
};
#else
struct kimage_arch {
	/*
	 * This is a kimage control page, as it must not overlap with either
	 * source or destination address ranges.
	 */
	pgd_t *pgd;
	/*
	 * The virtual mapping of the control code page itself is used only
	 * during the transition, while the current kernel's pages are all
	 * in place. Thus the intermediate page table pages used to map it
	 * are not control pages, but instead just normal pages obtained
	 * with get_zeroed_page(). And have to be tracked (below) so that
	 * they can be freed.
	 */
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
+24 −21
Original line number Diff line number Diff line
@@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image)
	image->arch.pte = NULL;
}

static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
				   unsigned long control_page)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
@@ -157,7 +158,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
	pte_t *pte;

	vaddr = (unsigned long)relocate_kernel;
	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
	paddr = control_page;
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -216,7 +217,7 @@ static void *alloc_pgt_page(void *data)
	return p;
}

static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
static int init_pgtable(struct kimage *image, unsigned long control_page)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
@@ -225,12 +226,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	unsigned long mstart, mend;
	pgd_t *level4p;
	int result;
	int i;

	level4p = (pgd_t *)__va(start_pgtable);
	clear_page(level4p);
	image->arch.pgd = alloc_pgt_page(image);
	if (!image->arch.pgd)
		return -ENOMEM;

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		info.page_flag   |= _PAGE_ENC;
@@ -244,8 +245,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend   = pfn_mapped[i].end << PAGE_SHIFT;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);
		result = kernel_ident_mapping_init(&info, image->arch.pgd,
						   mstart, mend);
		if (result)
			return result;
	}
@@ -260,8 +261,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;

		result = kernel_ident_mapping_init(&info,
						 level4p, mstart, mend);
		result = kernel_ident_mapping_init(&info, image->arch.pgd,
						   mstart, mend);

		if (result)
			return result;
@@ -271,15 +272,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
	 * Prepare EFI systab and ACPI tables for kexec kernel since they are
	 * not covered by pfn_mapped.
	 */
	result = map_efi_systab(&info, level4p);
	result = map_efi_systab(&info, image->arch.pgd);
	if (result)
		return result;

	result = map_acpi_tables(&info, level4p);
	result = map_acpi_tables(&info, image->arch.pgd);
	if (result)
		return result;

	return init_transition_pgtable(image, level4p);
	/*
	 * This must be last because the intermediate page table pages it
	 * allocates will not be control pages and may overlap the image.
	 */
	return init_transition_pgtable(image, image->arch.pgd, control_page);
}

static void load_segments(void)
@@ -296,14 +301,14 @@ static void load_segments(void)

int machine_kexec_prepare(struct kimage *image)
{
	unsigned long start_pgtable;
	unsigned long control_page;
	int result;

	/* Calculate the offsets */
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
	control_page = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

	/* Setup the identity mapped 64bit page table */
	result = init_pgtable(image, start_pgtable);
	result = init_pgtable(image, control_page);
	if (result)
		return result;

@@ -357,13 +362,12 @@ void machine_kexec(struct kimage *image)
#endif
	}

	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	control_page = page_address(image->control_code_page);
	__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));
	page_list[PA_TABLE_PAGE] = (unsigned long)__pa(image->arch.pgd);

	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
@@ -573,8 +577,7 @@ static void kexec_mark_crashkres(bool protect)

	/* Don't touch the control code page used in crash_kexec().*/
	control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
	/* Control code page is located in the 2nd page. */
	kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
	kexec_mark_range(crashk_res.start, control - 1, protect);
	control += KEXEC_CONTROL_PAGE_SIZE;
	kexec_mark_range(control, crashk_res.end, protect);
}