Unverified commit be17c0df, authored by Samuel Holland and committed by Palmer Dabbelt
Browse files

riscv: module: Optimize PLT/GOT entry counting



perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e03409 ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections so sorting
and counting is only done once per module.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250409171526.862481-3-samuel.holland@sifive.com


Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
parent 881dadf0
Loading
Loading
Loading
Loading
+65 −16
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/sort.h>

unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
{
@@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
	return (unsigned long)&plt[i];
}

static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))

static int cmp_rela(const void *a, const void *b)
{
	return x->r_info == y->r_info && x->r_addend == y->r_addend;
	const Elf_Rela *x = a, *y = b;
	int i;

	/* sort by type, symbol index and addend */
	i = cmp_3way(x->r_info, y->r_info);
	if (i == 0)
		i = cmp_3way(x->r_addend, y->r_addend);
	return i;
}

/*
 * Tell whether rela[idx] repeats an earlier entry of the array.
 *
 * Entries are sorted by type, symbol index and addend. That means
 * that, if a duplicate entry exists, it must be in the preceding slot,
 * so a single comparison against rela[idx - 1] is sufficient. The first
 * entry (idx == 0) has no predecessor and is never a duplicate.
 */
static bool duplicate_rela(const Elf_Rela *rela, int idx)
{
	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
}

static void count_max_entries(Elf_Rela *relas, int num,
static void count_max_entries(const Elf_Rela *relas, size_t num,
			      unsigned int *plts, unsigned int *gots)
{
	for (int i = 0; i < num; i++) {
	for (size_t i = 0; i < num; i++) {
		if (duplicate_rela(relas, i))
			continue;

		switch (ELF_R_TYPE(relas[i].r_info)) {
		case R_RISCV_CALL_PLT:
		case R_RISCV_PLT32:
			if (!duplicate_rela(relas, i))
			(*plts)++;
			break;
		case R_RISCV_GOT_HI20:
			if (!duplicate_rela(relas, i))
			(*gots)++;
			break;
		default:
			unreachable();
		}
	}
}

/*
 * Filter predicate: true when the relocation type of @rela is one of
 * R_RISCV_CALL_PLT, R_RISCV_PLT32 or R_RISCV_GOT_HI20, false for every
 * other type.
 */
static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
{
	const unsigned long type = ELF_R_TYPE(rela->r_info);

	if (type == R_RISCV_CALL_PLT || type == R_RISCV_PLT32)
		return true;

	return type == R_RISCV_GOT_HI20;
}

int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
			      char *secstrings, struct module *mod)
{
	size_t num_scratch_relas = 0;
	unsigned int num_plts = 0;
	unsigned int num_gots = 0;
	Elf_Rela *scratch = NULL;
	size_t scratch_size = 0;
	int i;

	/*
@@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,

	/* Calculate the maximum number of entries */
	for (i = 0; i < ehdr->e_shnum; i++) {
		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
		size_t scratch_size_needed;

		if (sechdrs[i].sh_type != SHT_RELA)
			continue;
@@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
			continue;

		count_max_entries(relas, num_rela, &num_plts, &num_gots);
		/*
		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
		 * close together, so sort a copy of the section to avoid interfering.
		 */
		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
		if (scratch_size_needed > scratch_size) {
			scratch_size = scratch_size_needed;
			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
			if (!scratch)
				return -ENOMEM;
		}

		for (size_t j = 0; j < num_relas; j++)
			if (rela_needs_plt_got_entry(&relas[j]))
				scratch[num_scratch_relas++] = relas[j];
	}

	if (scratch) {
		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
		kvfree(scratch);
	}

	mod->arch.plt.shdr->sh_type = SHT_NOBITS;