Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma (25f48746) · Commits · git / linux-nf

arch/arm64/include/asm/io.h

+132 −0

Original line number	Diff line number	Diff line
		@@ -139,6 +139,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
		/* Route the memcpy_fromio()/memcpy_toio() names to the __memcpy_*io() helpers. */
		#define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l))
		#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l))

		/*
		* The ARM64 iowrite implementation is intended to support drivers that want to
		* use write combining. For instance PCI drivers using write combining with a 64
		* byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus.
		*
		* Newer ARM cores have sensitive write combining buffers, so it is important
		* that the stores be contiguous blocks of store instructions. Normal memcpy
		* approaches have a very low chance to generate write combining.
		*
		* Since this is the only API on ARM64 that should be used with write combining
		* it also integrates the DGH hint which is supposed to lower the latency to
		* emit the large TLP from the CPU.
		*/

		static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to,
		const u32 *from, size_t count)
		{
		switch (count) {
		case 8:
		asm volatile("str %w0, [%8, #4 * 0]\n"
		"str %w1, [%8, #4 * 1]\n"
		"str %w2, [%8, #4 * 2]\n"
		"str %w3, [%8, #4 * 3]\n"
		"str %w4, [%8, #4 * 4]\n"
		"str %w5, [%8, #4 * 5]\n"
		"str %w6, [%8, #4 * 6]\n"
		"str %w7, [%8, #4 * 7]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
		"rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
		"rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
		case 4:
		asm volatile("str %w0, [%4, #4 * 0]\n"
		"str %w1, [%4, #4 * 1]\n"
		"str %w2, [%4, #4 * 2]\n"
		"str %w3, [%4, #4 * 3]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
		"rZ"(from[3]), "r"(to));
		break;
		case 2:
		asm volatile("str %w0, [%2, #4 * 0]\n"
		"str %w1, [%2, #4 * 1]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
		case 1:
		__raw_writel(*from, to);
		break;
		default:
		BUILD_BUG();
		}
		}

		void __iowrite32_copy_full(void __iomem to, const void from, size_t count);

		static inline void __const_iowrite32_copy(void __iomem to, const void from,
		size_t count)
		{
		if (count == 8 \|\| count == 4 \|\| count == 2 \|\| count == 1) {
		__const_memcpy_toio_aligned32(to, from, count);
		dgh();
		} else {
		__iowrite32_copy_full(to, from, count);
		}
		}

		/*
		 * Use the inline fast path when count is a compile-time constant and the
		 * out-of-line __iowrite32_copy_full() otherwise.
		 */
		#define __iowrite32_copy(to, from, count) \
		(__builtin_constant_p(count) ? \
		__const_iowrite32_copy(to, from, count) : \
		__iowrite32_copy_full(to, from, count))

		static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to,
		const u64 *from, size_t count)
		{
		switch (count) {
		case 8:
		asm volatile("str %x0, [%8, #8 * 0]\n"
		"str %x1, [%8, #8 * 1]\n"
		"str %x2, [%8, #8 * 2]\n"
		"str %x3, [%8, #8 * 3]\n"
		"str %x4, [%8, #8 * 4]\n"
		"str %x5, [%8, #8 * 5]\n"
		"str %x6, [%8, #8 * 6]\n"
		"str %x7, [%8, #8 * 7]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
		"rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
		"rZ"(from[6]), "rZ"(from[7]), "r"(to));
		break;
		case 4:
		asm volatile("str %x0, [%4, #8 * 0]\n"
		"str %x1, [%4, #8 * 1]\n"
		"str %x2, [%4, #8 * 2]\n"
		"str %x3, [%4, #8 * 3]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
		"rZ"(from[3]), "r"(to));
		break;
		case 2:
		asm volatile("str %x0, [%2, #8 * 0]\n"
		"str %x1, [%2, #8 * 1]\n"
		:
		: "rZ"(from[0]), "rZ"(from[1]), "r"(to));
		break;
		case 1:
		__raw_writeq(*from, to);
		break;
		default:
		BUILD_BUG();
		}
		}

		void __iowrite64_copy_full(void __iomem to, const void from, size_t count);

		static inline void __const_iowrite64_copy(void __iomem to, const void from,
		size_t count)
		{
		if (count == 8 \|\| count == 4 \|\| count == 2 \|\| count == 1) {
		__const_memcpy_toio_aligned64(to, from, count);
		dgh();
		} else {
		__iowrite64_copy_full(to, from, count);
		}
		}

		/*
		 * Use the inline fast path when count is a compile-time constant and the
		 * out-of-line __iowrite64_copy_full() otherwise.
		 */
		#define __iowrite64_copy(to, from, count) \
		(__builtin_constant_p(count) ? \
		__const_iowrite64_copy(to, from, count) : \
		__iowrite64_copy_full(to, from, count))

		/*
		* I/O memory mapping functions.
		*/

arch/arm64/kernel/io.c

+42 −0

Original line number	Diff line number	Diff line
		@@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
		}
		EXPORT_SYMBOL(__memcpy_fromio);

		/*
		* This generates a memcpy that works on from/to addresses which are aligned to
		* 'bits' bits. Count is the number of 'bits'-sized quantities to copy. It
		* optimizes to use the STR groupings when possible so that it is WC friendly.
		*/
		#define memcpy_toio_aligned(to, from, count, bits) \
		({ \
		volatile u##bits __iomem *_to = to; \
		const u##bits *_from = from; \
		size_t _count = count; \
		const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \
		\
		for (; _from < _end_from; _from += 8, _to += 8) \
		__const_memcpy_toio_aligned##bits(_to, _from, 8); \
		if ((_count % 8) >= 4) { \
		__const_memcpy_toio_aligned##bits(_to, _from, 4); \
		_from += 4; \
		_to += 4; \
		} \
		if ((_count % 4) >= 2) { \
		__const_memcpy_toio_aligned##bits(_to, _from, 2); \
		_from += 2; \
		_to += 2; \
		} \
		if (_count % 2) \
		__const_memcpy_toio_aligned##bits(_to, _from, 1); \
		})

		void __iowrite64_copy_full(void __iomem to, const void from, size_t count)
		{
		memcpy_toio_aligned(to, from, count, 64);
		dgh();
		}
		EXPORT_SYMBOL(__iowrite64_copy_full);

		void __iowrite32_copy_full(void __iomem to, const void from, size_t count)
		{
		memcpy_toio_aligned(to, from, count, 32);
		dgh();
		}
		EXPORT_SYMBOL(__iowrite32_copy_full);

		/*
		* Copy data from "real" memory space to IO memory space.
		*/

arch/s390/include/asm/io.h

+15 −0

Original line number	Diff line number	Diff line
		@@ -73,6 +73,21 @@ static inline void ioport_unmap(void __iomem *p)
		/* 32-bit and 64-bit raw writes are provided by the zpci_write_*() helpers. */
		#define __raw_writel zpci_write_u32
		#define __raw_writeq zpci_write_u64

		/* combine single writes by using store-block insn */
		static inline void __iowrite32_copy(void __iomem to, const void from,
		size_t count)
		{
		zpci_memcpy_toio(to, from, count * 4);
		}
		#define __iowrite32_copy __iowrite32_copy

		static inline void __iowrite64_copy(void __iomem to, const void from,
		size_t count)
		{
		zpci_memcpy_toio(to, from, count * 8);
		}
		#define __iowrite64_copy __iowrite64_copy

		#endif /* CONFIG_PCI */

		#include <asm-generic/io.h>

arch/s390/pci/pci.c

+0 −6

Original line number	Diff line number	Diff line
		@@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
		return 0;
		}

		/* combine single writes by using store-block insn */
		void __iowrite64_copy(void __iomem to, const void from, size_t count)
		{
		zpci_memcpy_toio(to, from, count * 8);
		}

		void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
		unsigned long prot)
		{

arch/x86/include/asm/io.h

+17 −0

Original line number	Diff line number	Diff line
		@@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t);
		#define memcpy_toio memcpy_toio
		#define memset_io memset_io

		#ifdef CONFIG_X86_64
		/*
		 * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
		 * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
		 * loop.
		 *
		 * Copies @count 32-bit quantities from @from to the I/O memory at @to.
		 * The count, destination and source live in the c, D and S registers. The
		 * instruction modifies all three, so each is declared as an early-clobber
		 * output tied to its matching input.
		 */
		static inline void __iowrite32_copy(void __iomem *to, const void *from,
						    size_t count)
		{
			asm volatile("rep ; movsl"
				     : "=&c"(count), "=&D"(to), "=&S"(from)
				     : "0"(count), "1"(to), "2"(from)
				     : "memory");
		}
		#define __iowrite32_copy __iowrite32_copy
		#endif

		/*
		* ISA space is 'always mapped' on a typical x86 system, no need to
		* explicitly ioremap() it. The fact that the ISA IO space is mapped