From 6f770b73d0311a5b099277653199bb6421c4fed2 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 15 Mar 2026 17:27:49 +0900 Subject: [PATCH 01/10] dma: swiotlb: add KMSAN annotations to swiotlb_bounce() When a device performs DMA to a bounce buffer, KMSAN is unaware of the write and does not mark the data as initialized. When swiotlb_bounce() later copies the bounce buffer back to the original buffer, memcpy propagates the uninitialized shadow to the original buffer, causing false positive uninit-value reports. Fix this by calling kmsan_unpoison_memory() on the bounce buffer before copying it back in the DMA_FROM_DEVICE path, so that memcpy naturally propagates initialized shadow to the destination. Suggested-by: Alexander Potapenko Link: https://lore.kernel.org/CAG_fn=WUGta-paG1BgsGRoAR+fmuCgh3xo=R3XdzOt_-DqSdHw@mail.gmail.com/ Fixes: 7ade4f10779c ("dma: kmsan: unpoison DMA mappings") Signed-off-by: Shigeru Yoshida Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260315082750.2375581-1-syoshida@redhat.com --- kernel/dma/swiotlb.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8e6f1d889d5..9fd73700ddcf 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -901,10 +902,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size local_irq_save(flags); page = pfn_to_page(pfn); - if (dir == DMA_TO_DEVICE) + if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_highmem_page() + * could be used here to detect infoleaks, + * but callers may map uninitialized buffers + * that will be written by the device, + * causing false positives. + */ memcpy_from_page(vaddr, page, offset, sz); - else + } else { + kmsan_unpoison_memory(vaddr, sz); memcpy_to_page(page, offset, vaddr, sz); + } local_irq_restore(flags); size -= sz; @@ -913,8 +923,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size offset = 0; } } else if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_memory() could be used here to detect + * infoleaks (uninitialized data being sent to device), but + * callers may map uninitialized buffers that will be written + * by the device, causing false positives. + */ memcpy(vaddr, phys_to_virt(orig_addr), size); } else { + kmsan_unpoison_memory(vaddr, size); memcpy(phys_to_virt(orig_addr), vaddr, size); } } From eca58535b154e6951327319afda94ac80eae7dc3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:45 +0200 Subject: [PATCH 02/10] dma-debug: Allow multiple invocations of overlapping entries Repeated DMA mappings with DMA_ATTR_CPU_CACHE_CLEAN trigger the following splat. This prevents using the attribute in cases where a DMA region is shared and reused more than seven times. ------------[ cut here ]------------ DMA-API: exceeded 7 overlapping mappings of cacheline 0x000000000438c440 WARNING: kernel/dma/debug.c:467 at add_dma_entry+0x219/0x280, CPU#4: ibv_rc_pingpong/1644 Modules linked in: xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core CPU: 4 UID: 2733 PID: 1644 Comm: ibv_rc_pingpong Not tainted 6.19.0+ #129 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:add_dma_entry+0x221/0x280 Code: c0 0f 84 f2 fe ff ff 83 e8 01 89 05 6d 99 11 01 e9 e4 fe ff ff 0f 8e 1f ff ff ff 48 8d 3d 07 ef 2d 01 be 07 00 00 00 48 89 e2 <67> 48 0f b9 3a e9 06 ff ff ff 48 c7 c7 98 05 2b 82 c6 05 72 92 28 RSP: 0018:ff1100010e657970 EFLAGS: 00010002 RAX: 0000000000000007 RBX: ff1100010234eb00 RCX: 0000000000000000 RDX: ff1100010e657970 RSI: 0000000000000007 RDI: ffffffff82678660 RBP: 000000000438c440 R08: 0000000000000228 R09: 0000000000000000 R10: 00000000000001be R11: 000000000000089d R12: 0000000000000800 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010234eb00 FS: 00007fb15f3f6740(0000) GS:ff110008dcc19000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb15f32d3a0 CR3: 0000000116f59001 CR4: 0000000000373eb0 Call Trace: debug_dma_map_sg+0x1b4/0x390 __dma_map_sg_attrs+0x6d/0x1a0 dma_map_sgtable+0x19/0x30 ib_umem_get+0x284/0x3b0 [ib_uverbs] mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib] ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs] ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs] ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs] ? mmap_region+0x7a/0xb0 ? do_mmap+0x3b8/0x5c0 ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs] __x64_sys_ioctl+0x14f/0x8b0 ? ksys_mmap_pgoff+0xc5/0x190 do_syscall_64+0x8c/0xbf0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fb15f5e4eed Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:00007ffe09a5c540 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe09a5c5d0 RCX: 00007fb15f5e4eed RDX: 00007ffe09a5c5f0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffe09a5c590 R08: 0000000000000028 R09: 00007ffe09a5c794 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe09a5c794 R13: 000000000000000c R14: 0000000025a49170 R15: 000000000000000c ---[ end trace 0000000000000000 ]--- Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN") Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-1-1dde90a7f08b@nvidia.com --- kernel/dma/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 86f87e43438c..be207be74996 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) return overlap; } -static void active_cacheline_inc_overlap(phys_addr_t cln) +static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean) { int overlap = active_cacheline_read_overlap(cln); @@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) /* If we overflowed the overlap counter then we're potentially * leaking dma-mappings. */ - WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), ACTIVE_CACHELINE_MAX_OVERLAP, &cln); } @@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry, if (rc == -EEXIST) { struct dma_debug_entry *existing; - active_cacheline_inc_overlap(cln); + active_cacheline_inc_overlap(cln, entry->is_cache_clean); existing = radix_tree_lookup(&dma_active_cacheline, cln); /* A lookup failure here after we got -EEXIST is unexpected. */ WARN_ON(!existing); From 6f45b1604cf43945ef472ae4ef30354025307c19 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:46 +0200 Subject: [PATCH 03/10] dma-mapping: handle DMA_ATTR_CPU_CACHE_CLEAN in trace output Tracing prints decoded DMA attribute flags, but it does not yet include the recently added DMA_ATTR_CPU_CACHE_CLEAN. Add support for decoding and displaying this attribute in the trace output. Fixes: 61868dc55a11 ("dma-mapping: add DMA_ATTR_CPU_CACHE_CLEAN") Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-2-1dde90a7f08b@nvidia.com --- include/trace/events/dma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 33e99e792f1a..69cb3805ee81 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -32,7 +32,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_ALLOC_SINGLE_PAGES, "ALLOC_SINGLE_PAGES" }, \ { DMA_ATTR_NO_WARN, "NO_WARN" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ - { DMA_ATTR_MMIO, "MMIO" }) + { DMA_ATTR_MMIO, "MMIO" }, \ + { DMA_ATTR_CPU_CACHE_CLEAN, "CACHE_CLEAN" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, From 9bb0a4d6a4433b75274204b083dac8e515d2007d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:47 +0200 Subject: [PATCH 04/10] dma-mapping: Clarify valid conditions for CPU cache line overlap Rename the DMA_ATTR_CPU_CACHE_CLEAN attribute to better reflect that it is debugging aid to inform DMA core code that CPU cache line overlaps are allowed, and refine the documentation describing its use. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-3-1dde90a7f08b@nvidia.com --- Documentation/core-api/dma-attributes.rst | 20 +++++++++++++------- drivers/virtio/virtio_ring.c | 10 +++++----- include/linux/dma-mapping.h | 8 ++++---- include/trace/events/dma.h | 2 +- kernel/dma/debug.c | 2 +- 5 files changed, 24 insertions(+), 18 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1d7bfad73b1c..48cfe86cc06d 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -149,11 +149,17 @@ For architectures that require cache flushing for DMA coherence DMA_ATTR_MMIO will not perform any cache flushing. The address provided must never be mapped cacheable into the CPU. -DMA_ATTR_CPU_CACHE_CLEAN ------------------------- +DMA_ATTR_DEBUGGING_IGNORE_CACHELINES +------------------------------------ -This attribute indicates the CPU will not dirty any cacheline overlapping this -DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows -multiple small buffers to safely share a cacheline without risk of data -corruption, suppressing DMA debug warnings about overlapping mappings. -All mappings sharing a cacheline should have this attribute. +This attribute indicates that CPU cache lines may overlap for buffers mapped +with DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + +Such overlap may occur when callers map multiple small buffers that reside +within the same cache line. In this case, callers must guarantee that the CPU +will not dirty these cache lines after the mappings are established. When this +condition is met, multiple buffers can safely share a cache line without risking +data corruption. + +All mappings that share a cache line must set this attribute to suppress DMA +debug warnings about overlapping mappings. diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 335692d41617..fbca7ce1c6bf 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2912,10 +2912,10 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * - * Same as virtqueue_add_inbuf but passes DMA_ATTR_CPU_CACHE_CLEAN to indicate - * that the CPU will not dirty any cacheline overlapping this buffer while it - * is available, and to suppress overlapping cacheline warnings in DMA debug - * builds. + * Same as virtqueue_add_inbuf but passes DMA_ATTR_DEBUGGING_IGNORE_CACHELINES + * to indicate that the CPU will not dirty any cacheline overlapping this buffer + * while it is available, and to suppress overlapping cacheline warnings in DMA + * debug builds. * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). @@ -2928,7 +2928,7 @@ int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, - DMA_ATTR_CPU_CACHE_CLEAN); + DMA_ATTR_DEBUGGING_IGNORE_CACHELINES); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_cache_clean); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 29973baa0581..da44394b3a1a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -80,11 +80,11 @@ #define DMA_ATTR_MMIO (1UL << 10) /* - * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline - * overlapping this buffer while it is mapped for DMA. All mappings sharing - * a cacheline must have this attribute for this to be considered safe. + * DMA_ATTR_DEBUGGING_IGNORE_CACHELINES: Indicates the CPU cache line can be + * overlapped. All mappings sharing a cacheline must have this attribute for + * this to be considered safe. */ -#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11) +#define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 69cb3805ee81..8c64bc0721fe 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -33,7 +33,7 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_NO_WARN, "NO_WARN" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_MMIO, "MMIO" }, \ - { DMA_ATTR_CPU_CACHE_CLEAN, "CACHE_CLEAN" }) + { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index be207be74996..83e1cfe05f08 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); From e6a58fa2556203a7f6731b4071705dc81cca5ca5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:48 +0200 Subject: [PATCH 05/10] dma-mapping: Introduce DMA require coherency attribute The mapping buffers which carry this attribute require DMA coherent system. This means that they can't take SWIOTLB path, can perform CPU cache overlap and doesn't perform cache flushing. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-4-1dde90a7f08b@nvidia.com --- Documentation/core-api/dma-attributes.rst | 16 ++++++++++++++++ include/linux/dma-mapping.h | 7 +++++++ include/trace/events/dma.h | 3 ++- kernel/dma/debug.c | 3 ++- kernel/dma/mapping.c | 6 ++++++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 48cfe86cc06d..123c8468d58f 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -163,3 +163,19 @@ data corruption. All mappings that share a cache line must set this attribute to suppress DMA debug warnings about overlapping mappings. + +DMA_ATTR_REQUIRE_COHERENT +------------------------- + +DMA mapping requests with the DMA_ATTR_REQUIRE_COHERENT fail on any +system where SWIOTLB or cache management is required. This should only +be used to support uAPI designs that require continuous HW DMA +coherence with userspace processes, for example RDMA and DRM. At a +minimum the memory being mapped must be userspace memory from +pin_user_pages() or similar. + +Drivers should consider using dma_mmap_pages() instead of this +interface when building their uAPIs, when possible. + +It must never be used in an in-kernel driver that only works with +kernel memory. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index da44394b3a1a..482b919f040f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -86,6 +86,13 @@ */ #define DMA_ATTR_DEBUGGING_IGNORE_CACHELINES (1UL << 11) +/* + * DMA_ATTR_REQUIRE_COHERENT: Indicates that DMA coherency is required. + * All mappings that carry this attribute can't work with SWIOTLB and cache + * flushing. + */ +#define DMA_ATTR_REQUIRE_COHERENT (1UL << 12) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 8c64bc0721fe..63597b004424 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -33,7 +33,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_NO_WARN, "NO_WARN" }, \ { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ { DMA_ATTR_MMIO, "MMIO" }, \ - { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }) + { DMA_ATTR_DEBUGGING_IGNORE_CACHELINES, "CACHELINES_OVERLAP" }, \ + { DMA_ATTR_REQUIRE_COHERENT, "REQUIRE_COHERENT" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 83e1cfe05f08..0677918f06a8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = attrs & DMA_ATTR_DEBUGGING_IGNORE_CACHELINES; + entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES | + DMA_ATTR_REQUIRE_COHERENT); bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3928a509c44c..6d3dd0bd3a88 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -164,6 +164,9 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; + if (dma_map_direct(dev, ops) || (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) addr = dma_direct_map_phys(dev, phys, size, dir, attrs); @@ -235,6 +238,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return -EOPNOTSUPP; + if (WARN_ON_ONCE(!dev->dma_mask)) return 0; From 2536617f20ddc7c2f4cef59b549aa45d166b03b1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:49 +0200 Subject: [PATCH 06/10] dma-direct: prevent SWIOTLB path when DMA_ATTR_REQUIRE_COHERENT is set DMA_ATTR_REQUIRE_COHERENT indicates that SWIOTLB must not be used. Ensure the SWIOTLB path is declined whenever the DMA direct path is selected. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-5-1dde90a7f08b@nvidia.com --- kernel/dma/direct.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e89f175e9c2d..6184ff303f08 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -84,7 +84,7 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); @@ -98,7 +98,8 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, dma_addr = phys_to_dma(dev, phys); if (unlikely(!dma_capable(dev, dma_addr, size, true)) || dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_swiotlb_active(dev)) + if (is_swiotlb_active(dev) && + !(attrs & DMA_ATTR_REQUIRE_COHERENT)) return swiotlb_map(dev, phys, size, dir, attrs); goto err_overflow; @@ -123,7 +124,7 @@ static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, { phys_addr_t phys; - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) /* nothing to do: uncached and no swiotlb */ return; From 636e6572e848339d2ae591949fe81de2cef00563 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:50 +0200 Subject: [PATCH 07/10] iommu/dma: add support for DMA_ATTR_REQUIRE_COHERENT attribute Add support for the DMA_ATTR_REQUIRE_COHERENT attribute to the exported functions. This attribute indicates that the SWIOTLB path must not be used and that no sync operations should be performed. Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-6-1dde90a7f08b@nvidia.com --- drivers/iommu/dma-iommu.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5dac64be61bb..94d514169642 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1211,7 +1211,7 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, */ if (dev_use_swiotlb(dev, size, dir) && iova_unaligned(iovad, phys, size)) { - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) return DMA_MAPPING_ERROR; phys = iommu_dma_map_swiotlb(dev, phys, size, dir, attrs); @@ -1223,7 +1223,8 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, arch_sync_dma_for_device(phys, size, dir); iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) + if (iova == DMA_MAPPING_ERROR && + !(attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); return iova; } @@ -1233,7 +1234,7 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, { phys_addr_t phys; - if (attrs & DMA_ATTR_MMIO) { + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) { __iommu_dma_unmap(dev, dma_handle, size); return; } @@ -1945,9 +1946,21 @@ int dma_iova_link(struct device *dev, struct dma_iova_state *state, if (WARN_ON_ONCE(iova_start_pad && offset > 0)) return -EIO; + /* + * DMA_IOVA_USE_SWIOTLB is set on state after some entry + * took SWIOTLB path, which we were supposed to prevent + * for DMA_ATTR_REQUIRE_COHERENT attribute. + */ + if (WARN_ON_ONCE((state->__size & DMA_IOVA_USE_SWIOTLB) && + (attrs & DMA_ATTR_REQUIRE_COHERENT))) + return -EOPNOTSUPP; + + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return -EOPNOTSUPP; + if (dev_use_swiotlb(dev, size, dir) && iova_unaligned(iovad, phys, size)) { - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) return -EPERM; return iommu_dma_iova_link_swiotlb(dev, state, phys, offset, From d9d43a3f5c48d5a3d1da922f46c4a30d94d61ba5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:51 +0200 Subject: [PATCH 08/10] RDMA/umem: Tell DMA mapping that UMEM requires coherency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RDMA subsystem exposes DMA regions through the verbs interface, which assumes a coherent system. Use the DMA_ATTR_REQUIRE_COHERENCE attribute to ensure coherency and avoid taking the SWIOTLB path. The RDMA verbs programming model resembles HMM and assumes concurrent DMA and CPU access to userspace memory. The hardware and programming model support "one-sided" operations initiated remotely without any local CPU involvement or notification. These include ATOMIC compare/swap, READ, and WRITE. A remote CPU can use these operations to traverse data structures, manipulate locks, and perform similar tasks without the host CPU’s awareness. If SWIOTLB substitutes memory or DMA is not cache coherent, these use cases break entirely. In-kernel RDMA is fine with incoherent mappings because kernel users do not rely on one-sided operations in ways that would expose these issues. A given region may also be exported multiple times, which can trigger warnings about cacheline overlaps. These warnings are suppressed when the new attribute is used. infiniband rocep8s0f0: mlx5_ib_reg_user_mr:1592:(pid 5812): start 0x2b28c000, iova 0x2b28c000, length 0x1000, access_flags 0x1 infiniband rocep8s0f0: mlx5_ib_reg_user_mr:1592:(pid 5812): start 0x2b28c001, iova 0x2b28c001, length 0xfff, access_flags 0x1 ------------[ cut here ]------------ DMA-API: mlx5_core 0000:08:00.0: cacheline tracking EEXIST, overlapping mappings aren't supported WARNING: kernel/dma/debug.c:620 at add_dma_entry+0x1bb/0x280, CPU#6: ibv_rc_pingpong/5812 Modules linked in: veth xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay mlx5_fwctl zram zsmalloc mlx5_ib fuse rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_core ib_core CPU: 6 UID: 2733 PID: 5812 Comm: ibv_rc_pingpong Tainted: G W 6.19.0+ #129 PREEMPT Tainted: [W]=WARN Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:add_dma_entry+0x1be/0x280 Code: 8b 7b 10 48 85 ff 0f 84 c3 00 00 00 48 8b 6f 50 48 85 ed 75 03 48 8b 2f e8 ff 8e 6a 00 48 89 c6 48 8d 3d 55 ef 2d 01 48 89 ea <67> 48 0f b9 3a 48 85 db 74 1a 48 c7 c7 b0 00 2b 82 e8 9c 25 fd ff RSP: 0018:ff11000138717978 EFLAGS: 00010286 RAX: ffffffffa02d7831 RBX: ff1100010246de00 RCX: 0000000000000000 RDX: ff110001036fac30 RSI: ffffffffa02d7831 RDI: ffffffff82678650 RBP: ff110001036fac30 R08: ff11000110dcb4a0 R09: ff11000110dcb478 R10: 0000000000000000 R11: ffffffff824b30a8 R12: 0000000000000000 R13: 00000000ffffffef R14: 0000000000000202 R15: ff1100010246de00 FS: 00007f59b411c740(0000) GS:ff110008dcc99000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffe538f7000 CR3: 000000010e066005 CR4: 0000000000373eb0 Call Trace: debug_dma_map_sg+0x1b4/0x390 __dma_map_sg_attrs+0x6d/0x1a0 dma_map_sgtable+0x19/0x30 ib_umem_get+0x254/0x380 [ib_uverbs] mlx5_ib_reg_user_mr+0x68/0x2a0 [mlx5_ib] ib_uverbs_reg_mr+0x17f/0x2a0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc2/0x130 [ib_uverbs] ib_uverbs_cmd_verbs+0xa0b/0xae0 [ib_uverbs] ? ib_uverbs_handler_UVERBS_METHOD_QUERY_PORT_SPEED+0xe0/0xe0 [ib_uverbs] ? mmap_region+0x7a/0xb0 ? do_mmap+0x3b8/0x5c0 ib_uverbs_ioctl+0xa7/0x110 [ib_uverbs] __x64_sys_ioctl+0x14f/0x8b0 ? ksys_mmap_pgoff+0xc5/0x190 do_syscall_64+0x8c/0xbf0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f59b430aeed Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:00007ffe538f9430 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffe538f94c0 RCX: 00007f59b430aeed RDX: 00007ffe538f94e0 RSI: 00000000c0181b01 RDI: 0000000000000003 RBP: 00007ffe538f9480 R08: 0000000000000028 R09: 00007ffe538f9684 R10: 0000000000000001 R11: 0000000000000246 R12: 00007ffe538f9684 R13: 000000000000000c R14: 000000002b28d170 R15: 000000000000000c ---[ end trace 0000000000000000 ]--- Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-7-1dde90a7f08b@nvidia.com --- drivers/infiniband/core/umem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index cff4fcca2c34..edc34c69f0f2 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -55,7 +55,8 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d if (dirty) ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt, - DMA_BIDIRECTIONAL, 0); + DMA_BIDIRECTIONAL, + DMA_ATTR_REQUIRE_COHERENT); for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i) { unpin_user_page_range_dirty_lock(sg_page(sg), @@ -169,7 +170,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, unsigned long lock_limit; unsigned long new_pinned; unsigned long cur_base; - unsigned long dma_attr = 0; + unsigned long dma_attr = DMA_ATTR_REQUIRE_COHERENT; struct mm_struct *mm; unsigned long npages; int pinned, ret; From f5ebf241c407dbf629fcf515015e139fcea2c2f0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 16 Mar 2026 21:06:52 +0200 Subject: [PATCH 09/10] mm/hmm: Indicate that HMM requires DMA coherency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM is fundamentally about allowing a sophisticated device to perform DMA directly to a process’s memory while the CPU accesses that same memory at the same time. It is similar to SVA but does not rely on IOMMU support. Because the entire model depends on concurrent access to shared memory, it fails as a uAPI if SWIOTLB substitutes the memory or if the CPU caches are not coherent with DMA. Until now, there has been no reliable way to report this, and various approximations have been used: int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map, size_t nr_entries, size_t dma_entry_size) { <...> /* * The HMM API violates our normal DMA buffer ownership rules and can't * transfer buffer ownership. The dma_addressing_limited() check is a * best approximation to ensure no swiotlb buffering happens. */ dma_need_sync = !dev->dma_skip_sync; if (dma_need_sync || dma_addressing_limited(dev)) return -EOPNOTSUPP; So let's mark mapped buffers with DMA_ATTR_REQUIRE_COHERENT attribute to prevent silent data corruption if someone tries to use hmm in a system with swiotlb or incoherent DMA Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260316-dma-debug-overlap-v3-8-1dde90a7f08b@nvidia.com --- mm/hmm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index f6c4ddff4bd6..5955f2f0c83d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -778,7 +778,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, struct page *page = hmm_pfn_to_page(pfns[idx]); phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]); size_t offset = idx * map->dma_entry_size; - unsigned long attrs = 0; + unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT; dma_addr_t dma_addr; int ret; @@ -871,7 +871,7 @@ bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx) struct dma_iova_state *state = &map->state; dma_addr_t *dma_addrs = map->dma_list; unsigned long *pfns = map->pfn_list; - unsigned long attrs = 0; + unsigned long attrs = DMA_ATTR_REQUIRE_COHERENT; if ((pfns[idx] & valid_dma) != valid_dma) return false; From 2cdaff22ed26f1e619aa2b43f27bb84f2c6ef8f8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 25 Mar 2026 02:55:48 +0100 Subject: [PATCH 10/10] dma-mapping: add missing `inline` for `dma_free_attrs` Under an UML build for an upcoming series [1], I got `-Wstatic-in-inline` for `dma_free_attrs`: BINDGEN rust/bindings/bindings_generated.rs - due to target missing In file included from rust/helpers/helpers.c:59: rust/helpers/dma.c:17:2: warning: static function 'dma_free_attrs' is used in an inline function with external linkage [-Wstatic-in-inline] 17 | dma_free_attrs(dev, size, cpu_addr, dma_handle, attrs); | ^ rust/helpers/dma.c:12:1: note: use 'static' to give inline function 'rust_helper_dma_free_attrs' internal linkage 12 | __rust_helper void rust_helper_dma_free_attrs(struct device *dev, size_t size, | ^ | static The issue is that `dma_free_attrs` was not marked `inline` when it was introduced alongside the rest of the stubs. Thus mark it. Fixes: ed6ccf10f24b ("dma-mapping: properly stub out the DMA API for !CONFIG_HAS_DMA") Closes: https://lore.kernel.org/rust-for-linux/20260322194616.89847-1-ojeda@kernel.org/ [1] Signed-off-by: Miguel Ojeda Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260325015548.70912-1-ojeda@kernel.org --- include/linux/dma-mapping.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 482b919f040f..99ef042ecdb4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -255,8 +255,8 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size, { return NULL; } -static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_handle, unsigned long attrs) +static inline void dma_free_attrs(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size,