mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-05 00:07:48 -04:00
AMD Secure Memory Encryption (SME) marks individual memory pages as encrypted by setting the C-bit in page table entries. According to the AMD APM, any pages corresponding to MMIO addresses must be configured with the C-bit clear. The current *_iommu_set_prot() implementation sets the C-bit on all PTEs in the IOMMU page tables. This is incorrect for PTEs backed by MMIO, and can break PCIe peer-to-peer communication when IOVA is used. Fix this by avoiding the C-bit for MMIO-backed mappings. For AMDv2 IOMMU page tables, there is a usage scenario for GVA->GPA mappings, and for trusted MMIO in the TEE-IO case the C-bit will need to be added to the GPA. However, SNP guests do not yet support vIOMMU, and trusted MMIO support is not ready upstream. Adding the C-bit for trusted MMIO can be considered once those features land. Fixes: 879ced2bab ("iommupt: Add the AMD IOMMU v1 page table format") Fixes: aef5de756e ("iommupt: Add the x86 64 bit page table format") Suggested-by: Jason Gunthorpe <jgg@nvidia.com> Signed-off-by: Wei Wang <wei.w.wang@hotmail.com> Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Vasant Hegde <vasant.hegde@amd.com> Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
413 lines
12 KiB
C
413 lines
12 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
|
|
*
|
|
* AMD IOMMU v1 page table
|
|
*
|
|
* This is described in Section "2.2.3 I/O Page Tables for Host Translations"
|
|
* of the "AMD I/O Virtualization Technology (IOMMU) Specification"
|
|
*
|
|
* Note the level numbering here matches the core code, so level 0 is the same
|
|
* as mode 1.
|
|
*
|
|
*/
|
|
#ifndef __GENERIC_PT_FMT_AMDV1_H
|
|
#define __GENERIC_PT_FMT_AMDV1_H
|
|
|
|
#include "defs_amdv1.h"
|
|
#include "../pt_defs.h"
|
|
|
|
#include <asm/page.h>
|
|
#include <linux/bitfield.h>
|
|
#include <linux/container_of.h>
|
|
#include <linux/mem_encrypt.h>
|
|
#include <linux/minmax.h>
|
|
#include <linux/sizes.h>
|
|
#include <linux/string.h>
|
|
|
|
/* Geometry parameters of the AMDv1 page table format */
enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
	 * uses a 2k page size to test cases where the CPU page size is not the
	 * same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	/* Each table level consumes one 4k page of memory */
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top physical address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};
|
|
|
|
/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),		/* Present */
	AMDV1PT_FMT_D = BIT(6),			/* Dirty */
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),	/* Output address */
	AMDV1PT_FMT_FC = BIT_ULL(60),		/* Force Coherent */
	AMDV1PT_FMT_IR = BIT_ULL(61),		/* Read permission */
	AMDV1PT_FMT_IW = BIT_ULL(62),		/* Write permission */
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
 * these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7
|
|
|
|
static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
|
|
{
|
|
u64 entry = pts->entry;
|
|
|
|
if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
|
|
entry = __sme_clr(entry);
|
|
return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
|
|
}
|
|
#define pt_table_pa amdv1pt_table_pa
|
|
|
|
/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	/* Strip the SME C-bit before decoding the address field */
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		/*
		 * Contiguous entries encode their size as a string of 1's in
		 * the low OA bits; clear them to recover the starting address.
		 */
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa
|
|
|
|
/* True if this level is allowed to hold translation (leaf) entries */
static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The top most level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf
|
|
|
|
/* Body in pt_fmt_defaults.h */
|
|
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);
|
|
|
|
/*
 * Return log2 of the number of entries in the contiguous run this entry
 * belongs to; ilog2(1) == 0 for a normal single entry.
 */
static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	/* NL_DEFAULT marks a plain, single entry */
	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the equation:
	 *  code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *                     PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *  num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *                   item_lg2sz - PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2
|
|
|
|
/* Return log2 of the number of entries in a table at this level */
static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only, this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2
|
|
|
|
/*
 * Return a bitmask with one bit per log2 leaf size that can be installed at
 * this level, or 0 when the level cannot hold leaves at all.
 */
static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes
|
|
|
|
static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
|
|
{
|
|
const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
|
|
unsigned int next_level;
|
|
u64 entry;
|
|
|
|
pts->entry = entry = READ_ONCE(*tablep);
|
|
if (!(entry & AMDV1PT_FMT_PR))
|
|
return PT_ENTRY_EMPTY;
|
|
|
|
next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
|
|
if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
|
|
next_level == AMDV1PT_FMT_NL_SIZE)
|
|
return PT_ENTRY_OA;
|
|
return PT_ENTRY_TABLE;
|
|
}
|
|
#define pt_load_entry_raw amdv1pt_load_entry_raw
|
|
|
|
/*
 * Install a translation of oasz_lg2 bytes at output address oa. A size larger
 * than one table item is written as a run of identical contiguous entries
 * using the NL_SIZE Next Level code.
 */
static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		/* Exactly one item: a normal default-sized entry */
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

		/*
		 * The size is encoded as a string of 1's in the low OA bits;
		 * every entry of the run carries the same value.
		 */
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry
|
|
|
|
static inline bool amdv1pt_install_table(struct pt_state *pts,
|
|
pt_oaddr_t table_pa,
|
|
const struct pt_write_attrs *attrs)
|
|
{
|
|
u64 entry;
|
|
|
|
/*
|
|
* IR and IW are ANDed from the table levels along with the PTE. We
|
|
* always control permissions from the PTE, so always set IR and IW for
|
|
* tables.
|
|
*/
|
|
entry = AMDV1PT_FMT_PR |
|
|
FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
|
|
FIELD_PREP(AMDV1PT_FMT_OA,
|
|
log2_div(table_pa, PT_GRANULE_LG2SZ)) |
|
|
AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
|
|
if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
|
|
entry = __sme_set(entry);
|
|
return pt_table_install64(pts, entry);
|
|
}
|
|
#define pt_install_table amdv1pt_install_table
|
|
|
|
static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
|
|
struct pt_write_attrs *attrs)
|
|
{
|
|
attrs->descriptor_bits =
|
|
pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
|
|
}
|
|
#define pt_attr_from_entry amdv1pt_attr_from_entry
|
|
|
|
static inline void amdv1pt_clear_entries(struct pt_state *pts,
|
|
unsigned int num_contig_lg2)
|
|
{
|
|
u64 *tablep = pt_cur_table(pts, u64) + pts->index;
|
|
u64 *end = tablep + log2_to_int(num_contig_lg2);
|
|
|
|
/*
|
|
* gcc generates rep stos for the io-pgtable code, and this difference
|
|
* can show in microbenchmarks with larger contiguous page sizes.
|
|
* rep is slower for small cases.
|
|
*/
|
|
if (num_contig_lg2 <= ilog2(32)) {
|
|
for (; tablep != end; tablep++)
|
|
WRITE_ONCE(*tablep, 0);
|
|
} else {
|
|
memset64(tablep, 0, log2_to_int(num_contig_lg2));
|
|
}
|
|
}
|
|
#define pt_clear_entries amdv1pt_clear_entries
|
|
|
|
static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
|
|
{
|
|
unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
|
|
u64 *tablep = pt_cur_table(pts, u64) +
|
|
log2_set_mod(pts->index, 0, num_contig_lg2);
|
|
u64 *end = tablep + log2_to_int(num_contig_lg2);
|
|
|
|
for (; tablep != end; tablep++)
|
|
if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
|
|
return true;
|
|
return false;
|
|
}
|
|
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty
|
|
|
|
static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
|
|
{
|
|
unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
|
|
u64 *tablep = pt_cur_table(pts, u64) +
|
|
log2_set_mod(pts->index, 0, num_contig_lg2);
|
|
u64 *end = tablep + log2_to_int(num_contig_lg2);
|
|
|
|
for (; tablep != end; tablep++)
|
|
WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
|
|
}
|
|
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean
|
|
|
|
static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
|
|
{
|
|
u64 *tablep = pt_cur_table(pts, u64) + pts->index;
|
|
u64 new = pts->entry | AMDV1PT_FMT_D;
|
|
|
|
return try_cmpxchg64(tablep, &pts->entry, new);
|
|
}
|
|
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty
|
|
|
|
/* --- iommu */
|
|
#include <linux/generic_pt/iommu.h>
|
|
#include <linux/iommu.h>
|
|
|
|
#define pt_iommu_table pt_iommu_amdv1
|
|
|
|
/* The common struct is in the per-format common struct */
|
|
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
|
|
{
|
|
return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
|
|
->amdpt.common;
|
|
}
|
|
|
|
static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
|
|
{
|
|
return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
|
|
}
|
|
|
|
/*
 * Translate IOMMU_* prot flags into the descriptor bits stored in leaf PTEs.
 * Always returns 0.
 */
static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 *
	 * MMIO mappings are excluded: pages corresponding to MMIO addresses
	 * must be configured with the C-bit clear, and setting it can break
	 * PCIe peer-to-peer communication when IOVA is used.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) &&
	    !(iommu_prot & IOMMU_MMIO))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot
|
|
|
|
/*
 * Validate the configuration and set up the format-specific table state:
 * address width limits and the starting top level.
 * Returns 0 on success, -EINVAL for an unusable starting_level.
 */
static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

	/*
	 * A fixed top below the maximum level limits the usable VA to what
	 * the configured number of table levels can actually translate.
	 */
	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	/* Clamp to what the hardware instance reports it supports */
	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init
|
|
|
|
#ifndef PT_FMT_VARIANT
/*
 * Report what the driver must program into the DTE: the root table physical
 * address and the mode (top level + 1).
 */
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	/* The DTE only has room for bits 51:12 of the root address */
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif
|
|
|
|
#if defined(GENERIC_PT_KUNIT)
|
|
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
|
|
/* Matches what io_pgtable does */
|
|
[0] = { .starting_level = 2 },
|
|
};
|
|
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
|
|
enum { KUNIT_FMT_FEATURES = 0 };
|
|
#endif
|
|
|
|
#endif
|