Files
linux-net/drivers/iommu/generic_pt/fmt/amdv1.h
Wei Wang e2692c4eea iommupt: Do not set C-bit on MMIO backed PTEs
AMD Secure Memory Encryption (SME) marks individual memory pages as
encrypted by setting the C-bit in page table entries. According to the
AMD APM, any pages corresponding to MMIO addresses must be configured
with the C-bit clear.

The current *_iommu_set_prot() implementation sets the C-bit on all PTEs
in the IOMMU page tables. This is incorrect for PTEs backed by MMIO, and
can break PCIe peer-to-peer communication when IOVA is used. Fix this by
avoiding the C-bit for MMIO-backed mappings.

For amdv2 IOMMU page tables there is a usage scenario with GVA->GPA
mappings, and for trusted MMIO in the TEE-IO case the C-bit will need to
be set on the GPA. However, SNP guests do not yet support vIOMMU, and
trusted MMIO support is not yet upstream. Adding the C-bit for trusted
MMIO can be considered once those features land.
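
As a rough illustration of the intended usage (a sketch only: map_p2p_bar
and its parameter values are hypothetical, while iommu_map() and the
IOMMU_MMIO flag are the existing core API), a caller mapping a peer
device's MMIO BAR passes IOMMU_MMIO in the prot flags so the format code
below skips __sme_set() and leaves the C-bit clear in the resulting PTE:

  #include <linux/iommu.h>

  /* Sketch: map an MMIO range; IOMMU_MMIO keeps the C-bit clear on AMDv1 */
  static int map_p2p_bar(struct iommu_domain *domain, unsigned long iova,
                         phys_addr_t bar_phys, size_t size)
  {
          return iommu_map(domain, iova, bar_phys, size,
                           IOMMU_READ | IOMMU_WRITE | IOMMU_MMIO,
                           GFP_KERNEL);
  }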

Fixes: 879ced2bab ("iommupt: Add the AMD IOMMU v1 page table format")
Fixes: aef5de756e ("iommupt: Add the x86 64 bit page table format")
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Wei Wang <wei.w.wang@hotmail.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
2026-01-19 10:19:54 +01:00

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
*
* AMD IOMMU v1 page table
*
* This is described in Section "2.2.3 I/O Page Tables for Host Translations"
* of the "AMD I/O Virtualization Technology (IOMMU) Specification"
*
* Note the level numbering here matches the core code, so level 0 is the same
* as mode 1.
*
*/
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H
#include "defs_amdv1.h"
#include "../pt_defs.h"
#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>
enum {
PT_ITEM_WORD_SIZE = sizeof(u64),
/*
* The IOMMUFD selftest uses the AMDv1 format with some alterations. It
* uses a 2k page size to test cases where the CPU page size is not the
* same.
*/
#ifdef AMDV1_IOMMUFD_SELFTEST
PT_MAX_VA_ADDRESS_LG2 = 56,
PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
PT_MAX_TOP_LEVEL = 4,
PT_GRANULE_LG2SZ = 11,
#else
PT_MAX_VA_ADDRESS_LG2 = 64,
PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
PT_MAX_TOP_LEVEL = 5,
PT_GRANULE_LG2SZ = 12,
#endif
PT_TABLEMEM_LG2SZ = 12,
/* The DTE only has these bits for the top physical address */
PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};
/* PTE bits */
enum {
AMDV1PT_FMT_PR = BIT(0),
AMDV1PT_FMT_D = BIT(6),
AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
AMDV1PT_FMT_FC = BIT_ULL(60),
AMDV1PT_FMT_IR = BIT_ULL(61),
AMDV1PT_FMT_IW = BIT_ULL(62),
};
/*
* gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, so
* make these defines to avoid it.
*/
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7
static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
u64 entry = pts->entry;
if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
entry = __sme_clr(entry);
return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa
/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
u64 entry = pts->entry;
pt_oaddr_t oa;
if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
entry = __sme_clr(entry);
oa = FIELD_GET(AMDV1PT_FMT_OA, entry);
if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
unsigned int sz_bits = oaffz(oa);
oa = oalog2_set_mod(oa, 0, sz_bits);
} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
AMDV1PT_FMT_NL_DEFAULT))
return 0;
return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa
static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
/*
* Table 15: Page Table Level Parameters
* The top most level cannot have translation entries
*/
return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf
/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
u32 code;
if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
AMDV1PT_FMT_NL_DEFAULT)
return ilog2(1);
PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
AMDV1PT_FMT_NL_SIZE);
/*
* The contiguous size is encoded in the length of a string of 1's in
* the low bits of the OA. Reverse the equation:
* code = log2_to_int(num_contig_lg2 + item_lg2sz -
* PT_GRANULE_LG2SZ - 1) - 1
* Which can be expressed as:
* num_contig_lg2 = oalog2_ffz(code) + 1 -
* item_lg2sz + PT_GRANULE_LG2SZ
*
* Assume the bit layout is correct and remove the masking. Reorganize
* the equation to move all the arithmetic before the ffz.
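*
* For example, a 2 MiB mapping built from 4 KiB items (num_contig_lg2 = 9,
* item_lg2sz = PT_GRANULE_LG2SZ = 12) stores code = 2^8 - 1 = 0xff in the
* low OA bits; ffz(0xff) = 8 and adding 1 recovers num_contig_lg2 = 9.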
*/
code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
/*
* Top entry covers bits [63:57] only; this is handled through
* max_vasz_lg2.
*/
if (PT_WARN_ON(pts->level == 5))
return 7;
return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2
static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
if (!amdv1pt_can_have_leaf(pts))
return 0;
/*
* Table 14: Example Page Size Encodings
* Address bits 51:32 can be used to encode page sizes greater than 4
* Gbytes. Address bits 63:52 are zero-extended.
*
* 512GB Pages are not supported due to a hardware bug.
* Otherwise every power of two size is supported.
*/
return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes
static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
unsigned int next_level;
u64 entry;
pts->entry = entry = READ_ONCE(*tablep);
if (!(entry & AMDV1PT_FMT_PR))
return PT_ENTRY_EMPTY;
next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
next_level == AMDV1PT_FMT_NL_SIZE)
return PT_ENTRY_OA;
return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw
static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
unsigned int oasz_lg2,
const struct pt_write_attrs *attrs)
{
unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
u64 *tablep = pt_cur_table(pts, u64) + pts->index;
u64 entry;
if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
return;
entry = AMDV1PT_FMT_PR |
FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
attrs->descriptor_bits;
if (oasz_lg2 == isz_lg2) {
entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
AMDV1PT_FMT_NL_DEFAULT);
WRITE_ONCE(*tablep, entry);
} else {
unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
u64 *end = tablep + log2_to_int(num_contig_lg2);
entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
AMDV1PT_FMT_NL_SIZE) |
FIELD_PREP(AMDV1PT_FMT_OA,
oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
1) -
1);
/* See amdv1pt_clear_entries() */
if (num_contig_lg2 <= ilog2(32)) {
for (; tablep != end; tablep++)
WRITE_ONCE(*tablep, entry);
} else {
memset64(tablep, entry, log2_to_int(num_contig_lg2));
}
}
pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry
static inline bool amdv1pt_install_table(struct pt_state *pts,
pt_oaddr_t table_pa,
const struct pt_write_attrs *attrs)
{
u64 entry;
/*
* IR and IW are ANDed from the table levels along with the PTE. We
* always control permissions from the PTE, so always set IR and IW for
* tables.
*/
entry = AMDV1PT_FMT_PR |
FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
FIELD_PREP(AMDV1PT_FMT_OA,
log2_div(table_pa, PT_GRANULE_LG2SZ)) |
AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
entry = __sme_set(entry);
return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table
static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
struct pt_write_attrs *attrs)
{
attrs->descriptor_bits =
pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry
static inline void amdv1pt_clear_entries(struct pt_state *pts,
unsigned int num_contig_lg2)
{
u64 *tablep = pt_cur_table(pts, u64) + pts->index;
u64 *end = tablep + log2_to_int(num_contig_lg2);
/*
* gcc generates rep stos for the io-pgtable code, and this difference
* can show in microbenchmarks with larger contiguous page sizes.
* rep is slower for small cases.
*/
if (num_contig_lg2 <= ilog2(32)) {
for (; tablep != end; tablep++)
WRITE_ONCE(*tablep, 0);
} else {
memset64(tablep, 0, log2_to_int(num_contig_lg2));
}
}
#define pt_clear_entries amdv1pt_clear_entries
static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
u64 *tablep = pt_cur_table(pts, u64) +
log2_set_mod(pts->index, 0, num_contig_lg2);
u64 *end = tablep + log2_to_int(num_contig_lg2);
for (; tablep != end; tablep++)
if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
return true;
return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
u64 *tablep = pt_cur_table(pts, u64) +
log2_set_mod(pts->index, 0, num_contig_lg2);
u64 *end = tablep + log2_to_int(num_contig_lg2);
for (; tablep != end; tablep++)
WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
u64 *tablep = pt_cur_table(pts, u64) + pts->index;
u64 new = pts->entry | AMDV1PT_FMT_D;
return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>
#define pt_iommu_table pt_iommu_amdv1
/* The pt_common struct is embedded in the per-format struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
->amdpt.common;
}
static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}
static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
struct pt_write_attrs *attrs,
unsigned int iommu_prot)
{
u64 pte = 0;
if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
pte |= AMDV1PT_FMT_FC;
if (iommu_prot & IOMMU_READ)
pte |= AMDV1PT_FMT_IR;
if (iommu_prot & IOMMU_WRITE)
pte |= AMDV1PT_FMT_IW;
/*
* Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
* control this. For now if the tables use sme_set then so do the ptes.
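* MMIO-backed mappings are the exception: per the AMD APM they must keep
* the C-bit clear, so skip __sme_set() when IOMMU_MMIO is set.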
*/
if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) &&
!(iommu_prot & IOMMU_MMIO))
pte = __sme_set(pte);
attrs->descriptor_bits = pte;
return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot
static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
const struct pt_iommu_amdv1_cfg *cfg)
{
struct pt_amdv1 *table = &iommu_table->amdpt;
unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
return -EINVAL;
if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
cfg->starting_level != PT_MAX_TOP_LEVEL)
max_vasz_lg2 = PT_GRANULE_LG2SZ +
(PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
(cfg->starting_level + 1);
table->common.max_vasz_lg2 =
min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
table->common.max_oasz_lg2 =
min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
pt_top_set_level(&table->common, cfg->starting_level);
return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
#ifndef PT_FMT_VARIANT
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
const struct pt_range *top_range,
struct pt_iommu_amdv1_hw_info *info)
{
info->host_pt_root = virt_to_phys(top_range->top_table);
PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif
#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
/* Matches what io_pgtable does */
[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif
#endif