Unverified Commit 300ce44c authored by Palmer Dabbelt
Browse files

Merge patch series "Rework & improve riscv cmpxchg.h and atomic.h"

Leonardo Bras <leobras@redhat.com> says:

While studying riscv's cmpxchg.h file, I got really interested in
understanding how RISCV asm implemented the different versions of
{cmp,}xchg.

When I understood the pattern, it made sense for me to remove the
duplications and create macros to make it easier to understand what exactly
changes between the versions: Instruction suffixes & barriers.

Also, I did the same kind of work on atomic.h.

After that, I noted both cmpxchg and xchg only accept variables of
size 4 and 8, compared to x86 and arm64 which do 1,2,4,8.

Now that deduplication is done, it is quite direct to implement them
for variable sizes 1 and 2, so I did it. Then Guo Ren already presented
me some possible users :)

I did compare the generated asm on a test.c that contained usage for every
changed function, and could not detect any change on patches 1 + 2 + 3
compared with upstream.

Patches 4 & 5 were compile-tested, merged with guoren/qspinlock_v11 and
booted just fine with qemu -machine virt -append "qspinlock".

(tree: https://gitlab.com/LeoBras/linux/-/commits/guo_qspinlock_v11)

Latest tests happened based on this tree:
https://github.com/guoren83/linux/tree/qspinlock_v12

* b4-shazam-lts:
  riscv/cmpxchg: Implement xchg for variables of size 1 and 2
  riscv/cmpxchg: Implement cmpxchg for variables of size 1 and 2
  riscv/atomic.h : Deduplicate arch_atomic.*
  riscv/cmpxchg: Deduplicate cmpxchg() asm and macros
  riscv/cmpxchg: Deduplicate xchg() asm functions

Link: https://lore.kernel.org/r/20240103163203.72768-2-leobras@redhat.com


Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
parents 542124fc a8ed2b7a
Loading
Loading
Loading
Loading
+76 −88
Original line number Diff line number Diff line
@@ -195,22 +195,28 @@ ATOMIC_OPS(xor, xor, i)
#undef ATOMIC_FETCH_OP
#undef ATOMIC_OP_RETURN

/*
 * _arch_atomic_fetch_add_unless() - width-generic LR/SC "add unless" loop.
 * @_prev:   lvalue receiving the value loaded from @counter
 * @_rc:     scratch lvalue; first holds @_prev + @_a, then the sc result
 * @counter: memory operand that is read and conditionally rewritten
 * @_a:      addend
 * @_u:      value for which the update is skipped
 * @sfx:     string literal spliced after "lr." and "sc."; the callers in
 *           this file pass "w" or "d"
 *
 * If the loaded value equals @_u the code branches directly to label 1,
 * so neither the store nor the fence is executed.  Otherwise the sum is
 * written back with sc.<sfx>.rl and the sequence restarts from label 0
 * for as long as the sc result left in @_rc is non-zero.  "fence rw, rw"
 * runs only after the loop falls through, i.e. on the path that stored.
 *
 * Both register outputs are early-clobber ("=&r") and the statement
 * carries a "memory" clobber.
 */
#define _arch_atomic_fetch_add_unless(_prev, _rc, counter, _a, _u, sfx)	\
({									\
	__asm__ __volatile__ (						\
		"0:	lr." sfx "     %[p],  %[c]\n"			\
		"	beq	       %[p],  %[u], 1f\n"		\
		"	add            %[rc], %[p], %[a]\n"		\
		"	sc." sfx ".rl  %[rc], %[rc], %[c]\n"		\
		"	bnez           %[rc], 0b\n"			\
		"	fence          rw, rw\n"			\
		"1:\n"							\
		: [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter)	\
		: [a]"r" (_a), [u]"r" (_u)				\
		: "memory");						\
})

/* This is required to provide a full barrier on success. */
static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
       int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w     %[p],  %[c]\n"
		"	beq      %[p],  %[u], 1f\n"
		"	add      %[rc], %[p], %[a]\n"
		"	sc.w.rl  %[rc], %[rc], %[c]\n"
		"	bnez     %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		: [a]"r" (a), [u]"r" (u)
		: "memory");
	_arch_atomic_fetch_add_unless(prev, rc, v->counter, a, u, "w");

	return prev;
}
#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
@@ -221,77 +227,86 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a,
       s64 prev;
       long rc;

	__asm__ __volatile__ (
		"0:	lr.d     %[p],  %[c]\n"
		"	beq      %[p],  %[u], 1f\n"
		"	add      %[rc], %[p], %[a]\n"
		"	sc.d.rl  %[rc], %[rc], %[c]\n"
		"	bnez     %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		: [a]"r" (a), [u]"r" (u)
		: "memory");
	_arch_atomic_fetch_add_unless(prev, rc, v->counter, a, u, "d");

	return prev;
}
#define arch_atomic64_fetch_add_unless arch_atomic64_fetch_add_unless
#endif

/*
 * _arch_atomic_inc_unless_negative() - width-generic LR/SC increment that
 * is skipped when the current value is negative.
 * @_prev:   lvalue receiving the value loaded from @counter
 * @_rc:     scratch lvalue; first holds @_prev + 1, then the sc result
 * @counter: memory operand that is read and conditionally rewritten
 * @sfx:     string literal spliced after "lr." and "sc."; the callers in
 *           this file pass "w" or "d"
 *
 * bltz sends a negative loaded value straight to label 1, bypassing the
 * store and the fence.  Otherwise the incremented value is written with
 * sc.<sfx>.rl, the loop restarts from label 0 while the sc result in
 * @_rc is non-zero, and "fence rw, rw" follows once the loop exits.
 */
#define _arch_atomic_inc_unless_negative(_prev, _rc, counter, sfx)	\
({									\
	__asm__ __volatile__ (						\
		"0:	lr." sfx "      %[p],  %[c]\n"			\
		"	bltz            %[p],  1f\n"			\
		"	addi            %[rc], %[p], 1\n"		\
		"	sc." sfx ".rl   %[rc], %[rc], %[c]\n"		\
		"	bnez            %[rc], 0b\n"			\
		"	fence           rw, rw\n"			\
		"1:\n"							\
		: [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter)	\
		:							\
		: "memory");						\
})

static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v)
{
	int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %[p],  %[c]\n"
		"	bltz      %[p],  1f\n"
		"	addi      %[rc], %[p], 1\n"
		"	sc.w.rl   %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_inc_unless_negative(prev, rc, v->counter, "w");

	return !(prev < 0);
}

#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative

/*
 * _arch_atomic_dec_unless_positive() - width-generic LR/SC decrement that
 * is skipped when the current value is greater than zero.
 * @_prev:   lvalue receiving the value loaded from @counter
 * @_rc:     scratch lvalue; first holds @_prev - 1, then the sc result
 * @counter: memory operand that is read and conditionally rewritten
 * @sfx:     string literal spliced after "lr." and "sc."; the callers in
 *           this file pass "w" or "d"
 *
 * bgtz sends a positive loaded value straight to label 1, bypassing the
 * store and the fence.  Otherwise the decremented value is written with
 * sc.<sfx>.rl, the loop restarts from label 0 while the sc result in
 * @_rc is non-zero, and "fence rw, rw" follows once the loop exits.
 */
#define _arch_atomic_dec_unless_positive(_prev, _rc, counter, sfx)	\
({									\
	__asm__ __volatile__ (						\
		"0:	lr." sfx "      %[p],  %[c]\n"			\
		"	bgtz            %[p],  1f\n"			\
		"	addi            %[rc], %[p], -1\n"		\
		"	sc." sfx ".rl   %[rc], %[rc], %[c]\n"		\
		"	bnez            %[rc], 0b\n"			\
		"	fence           rw, rw\n"			\
		"1:\n"							\
		: [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter)	\
		:							\
		: "memory");						\
})

static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v)
{
	int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w      %[p],  %[c]\n"
		"	bgtz      %[p],  1f\n"
		"	addi      %[rc], %[p], -1\n"
		"	sc.w.rl   %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_dec_unless_positive(prev, rc, v->counter, "w");

	return !(prev > 0);
}

#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive

/*
 * _arch_atomic_dec_if_positive() - width-generic LR/SC decrement that is
 * only stored when the decremented value is not negative.
 * @_prev:   lvalue receiving the value loaded from @counter
 * @_rc:     scratch lvalue; first holds @_prev - 1, then the sc result
 * @counter: memory operand that is read and conditionally rewritten
 * @sfx:     string literal spliced after "lr." and "sc."; the callers in
 *           this file pass "w" or "d"
 *
 * Unlike the *_unless_* variants, the test is made on the computed result
 * rather than on the loaded value: @_prev - 1 is formed first and bltz
 * jumps to label 1 (no store, no fence) when it is below zero.  Otherwise
 * the result is written with sc.<sfx>.rl, the loop restarts from label 0
 * while the sc result in @_rc is non-zero, and "fence rw, rw" follows once
 * the loop exits.  @_prev is left holding the loaded value on both paths.
 */
#define _arch_atomic_dec_if_positive(_prev, _rc, counter, sfx)		\
({									\
	__asm__ __volatile__ (						\
		"0:	lr." sfx "     %[p],  %[c]\n"			\
		"	addi           %[rc], %[p], -1\n"		\
		"	bltz           %[rc], 1f\n"			\
		"	sc." sfx ".rl  %[rc], %[rc], %[c]\n"		\
		"	bnez           %[rc], 0b\n"			\
		"	fence          rw, rw\n"			\
		"1:\n"							\
		: [p]"=&r" (_prev), [rc]"=&r" (_rc), [c]"+A" (counter)	\
		:							\
		: "memory");						\
})

static __always_inline int arch_atomic_dec_if_positive(atomic_t *v)
{
       int prev, rc;

	__asm__ __volatile__ (
		"0:	lr.w     %[p],  %[c]\n"
		"	addi     %[rc], %[p], -1\n"
		"	bltz     %[rc], 1f\n"
		"	sc.w.rl  %[rc], %[rc], %[c]\n"
		"	bnez     %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_dec_if_positive(prev, rc, v->counter, "w");

	return prev - 1;
}

@@ -303,17 +318,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v)
	s64 prev;
	long rc;

	__asm__ __volatile__ (
		"0:	lr.d      %[p],  %[c]\n"
		"	bltz      %[p],  1f\n"
		"	addi      %[rc], %[p], 1\n"
		"	sc.d.rl   %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_inc_unless_negative(prev, rc, v->counter, "d");

	return !(prev < 0);
}

@@ -324,17 +330,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v)
	s64 prev;
	long rc;

	__asm__ __volatile__ (
		"0:	lr.d      %[p],  %[c]\n"
		"	bgtz      %[p],  1f\n"
		"	addi      %[rc], %[p], -1\n"
		"	sc.d.rl   %[rc], %[rc], %[c]\n"
		"	bnez      %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_dec_unless_positive(prev, rc, v->counter, "d");

	return !(prev > 0);
}

@@ -345,17 +342,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
       s64 prev;
       long rc;

	__asm__ __volatile__ (
		"0:	lr.d     %[p],  %[c]\n"
		"	addi      %[rc], %[p], -1\n"
		"	bltz     %[rc], 1f\n"
		"	sc.d.rl  %[rc], %[rc], %[c]\n"
		"	bnez     %[rc], 0b\n"
		RISCV_FULL_BARRIER
		"1:\n"
		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
		:
		: "memory");
	_arch_atomic_dec_if_positive(prev, rc, v->counter, "d");

	return prev - 1;
}

+124 −280
Original line number Diff line number Diff line
@@ -10,140 +10,79 @@

#include <asm/fence.h>

#define __xchg_relaxed(ptr, new, size)					\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(new) __new = (new);					\
	__typeof__(*(ptr)) __ret;					\
	switch (size) {							\
	case 4:								\
		__asm__ __volatile__ (					\
			"	amoswap.w %0, %2, %1\n"			\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			"	amoswap.d %0, %2, %1\n"			\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
#define __arch_xchg_masked(prepend, append, r, p, n)			\
({									\
	u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3);			\
	ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE;	\
	ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)	\
			<< __s;						\
	ulong __newx = (ulong)(n) << __s;				\
	ulong __retx;							\
	ulong __rc;							\
									\
	__asm__ __volatile__ (						\
	       prepend							\
	       "0:	lr.w %0, %2\n"					\
	       "	and  %1, %0, %z4\n"				\
	       "	or   %1, %1, %z3\n"				\
	       "	sc.w %1, %1, %2\n"				\
	       "	bnez %1, 0b\n"					\
	       append							\
	       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
	       : "rJ" (__newx), "rJ" (~__mask)				\
	       : "memory");						\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})

#define arch_xchg_relaxed(ptr, x)					\
({									\
	__typeof__(*(ptr)) _x_ = (x);					\
	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
					    _x_, sizeof(*(ptr)));	\
									\
	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
})

#define __xchg_acquire(ptr, new, size)					\
#define __arch_xchg(sfx, prepend, append, r, p, n)			\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(new) __new = (new);					\
	__typeof__(*(ptr)) __ret;					\
	switch (size) {							\
	case 4:								\
	__asm__ __volatile__ (						\
			"	amoswap.w %0, %2, %1\n"			\
			RISCV_ACQUIRE_BARRIER				\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
		prepend							\
		"	amoswap" sfx " %0, %2, %1\n"			\
		append							\
		: "=r" (r), "+A" (*(p))					\
		: "r" (n)						\
		: "memory");						\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			"	amoswap.d %0, %2, %1\n"			\
			RISCV_ACQUIRE_BARRIER				\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})

#define arch_xchg_acquire(ptr, x)					\
({									\
	__typeof__(*(ptr)) _x_ = (x);					\
	(__typeof__(*(ptr))) __xchg_acquire((ptr),			\
					    _x_, sizeof(*(ptr)));	\
})

#define __xchg_release(ptr, new, size)					\
#define _arch_xchg(ptr, new, sfx, prepend, append)			\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(new) __new = (new);					\
	__typeof__(*(ptr)) __ret;					\
	switch (size) {							\
	__typeof__(*(__ptr)) __new = (new);				\
	__typeof__(*(__ptr)) __ret;					\
									\
	switch (sizeof(*__ptr)) {					\
	case 1:								\
	case 2:								\
		__arch_xchg_masked(prepend, append,			\
				   __ret, __ptr, __new);		\
		break;							\
	case 4:								\
		__asm__ __volatile__ (					\
			RISCV_RELEASE_BARRIER				\
			"	amoswap.w %0, %2, %1\n"			\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		__arch_xchg(".w" sfx, prepend, append,			\
			      __ret, __ptr, __new);			\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			RISCV_RELEASE_BARRIER				\
			"	amoswap.d %0, %2, %1\n"			\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		__arch_xchg(".d" sfx, prepend, append,			\
			      __ret, __ptr, __new);			\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
	(__typeof__(*(__ptr)))__ret;					\
})

#define arch_xchg_release(ptr, x)					\
({									\
	__typeof__(*(ptr)) _x_ = (x);					\
	(__typeof__(*(ptr))) __xchg_release((ptr),			\
					    _x_, sizeof(*(ptr)));	\
})
/*
 * Exchange with no ordering additions: _arch_xchg is given an empty
 * instruction suffix and empty prepend/append barrier strings.
 */
#define arch_xchg_relaxed(ptr, x)					\
	_arch_xchg(ptr, x, "", "", "")

#define __arch_xchg(ptr, new, size)					\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(new) __new = (new);					\
	__typeof__(*(ptr)) __ret;					\
	switch (size) {							\
	case 4:								\
		__asm__ __volatile__ (					\
			"	amoswap.w.aqrl %0, %2, %1\n"		\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			"	amoswap.d.aqrl %0, %2, %1\n"		\
			: "=r" (__ret), "+A" (*__ptr)			\
			: "r" (__new)					\
			: "memory");					\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})
/*
 * Exchange with acquire ordering: no instruction suffix and nothing
 * prepended; RISCV_ACQUIRE_BARRIER is passed as the string _arch_xchg
 * appends after the swap sequence.
 */
#define arch_xchg_acquire(ptr, x)					\
	_arch_xchg(ptr, x, "", "", RISCV_ACQUIRE_BARRIER)

/*
 * Exchange with release ordering: no instruction suffix and nothing
 * appended; RISCV_RELEASE_BARRIER is passed as the string _arch_xchg
 * prepends before the swap sequence.
 */
#define arch_xchg_release(ptr, x)					\
	_arch_xchg(ptr, x, "", RISCV_RELEASE_BARRIER, "")

#define arch_xchg(ptr, x)						\
({									\
	__typeof__(*(ptr)) _x_ = (x);					\
	(__typeof__(*(ptr))) __arch_xchg((ptr), _x_, sizeof(*(ptr)));	\
})
	_arch_xchg(ptr, x, ".aqrl", "", "")

#define xchg32(ptr, x)							\
({									\
@@ -162,190 +101,95 @@
 * store NEW in MEM.  Return the initial value in MEM.  Success is
 * indicated by comparing RETURN with OLD.
 */
#define __cmpxchg_relaxed(ptr, old, new, size)				\

#define __arch_cmpxchg_masked(sc_sfx, prepend, append, r, p, o, n)	\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(*(ptr)) __old = (old);				\
	__typeof__(*(ptr)) __new = (new);				\
	__typeof__(*(ptr)) __ret;					\
	register unsigned int __rc;					\
	switch (size) {							\
	case 4:								\
	u32 *__ptr32b = (u32 *)((ulong)(p) & ~0x3);			\
	ulong __s = ((ulong)(p) & (0x4 - sizeof(*p))) * BITS_PER_BYTE;	\
	ulong __mask = GENMASK(((sizeof(*p)) * BITS_PER_BYTE) - 1, 0)	\
			<< __s;						\
	ulong __newx = (ulong)(n) << __s;				\
	ulong __oldx = (ulong)(o) << __s;				\
	ulong __retx;							\
	ulong __rc;							\
									\
	__asm__ __volatile__ (						\
		prepend							\
		"0:	lr.w %0, %2\n"					\
			"	bne  %0, %z3, 1f\n"			\
			"	sc.w %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" ((long)__old), "rJ" (__new)		\
			: "memory");					\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			"0:	lr.d %0, %2\n"				\
			"	bne %0, %z3, 1f\n"			\
			"	sc.d %1, %z4, %2\n"			\
		"	and  %1, %0, %z5\n"				\
		"	bne  %1, %z3, 1f\n"				\
		"	and  %1, %0, %z6\n"				\
		"	or   %1, %1, %z4\n"				\
		"	sc.w" sc_sfx " %1, %1, %2\n"			\
		"	bnez %1, 0b\n"					\
		append							\
		"1:\n"							\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" (__old), "rJ" (__new)			\
		: "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
		: "rJ" ((long)__oldx), "rJ" (__newx),			\
		  "rJ" (__mask), "rJ" (~__mask)				\
		: "memory");						\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})

#define arch_cmpxchg_relaxed(ptr, o, n)					\
({									\
	__typeof__(*(ptr)) _o_ = (o);					\
	__typeof__(*(ptr)) _n_ = (n);					\
	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
					_o_, _n_, sizeof(*(ptr)));	\
									\
	r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
})

#define __cmpxchg_acquire(ptr, old, new, size)				\
#define __arch_cmpxchg(lr_sfx, sc_sfx, prepend, append, r, p, co, o, n)	\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(*(ptr)) __old = (old);				\
	__typeof__(*(ptr)) __new = (new);				\
	__typeof__(*(ptr)) __ret;					\
	register unsigned int __rc;					\
	switch (size) {							\
	case 4:								\
		__asm__ __volatile__ (					\
			"0:	lr.w %0, %2\n"				\
			"	bne  %0, %z3, 1f\n"			\
			"	sc.w %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			RISCV_ACQUIRE_BARRIER				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" ((long)__old), "rJ" (__new)		\
			: "memory");					\
		break;							\
	case 8:								\
									\
	__asm__ __volatile__ (						\
			"0:	lr.d %0, %2\n"				\
		prepend							\
		"0:	lr" lr_sfx " %0, %2\n"				\
		"	bne  %0, %z3, 1f\n"				\
			"	sc.d %1, %z4, %2\n"			\
		"	sc" sc_sfx " %1, %z4, %2\n"			\
		"	bnez %1, 0b\n"					\
			RISCV_ACQUIRE_BARRIER				\
		append							\
		"1:\n"							\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" (__old), "rJ" (__new)			\
		: "=&r" (r), "=&r" (__rc), "+A" (*(p))			\
		: "rJ" (co o), "rJ" (n)					\
		: "memory");						\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})

#define arch_cmpxchg_acquire(ptr, o, n)					\
({									\
	__typeof__(*(ptr)) _o_ = (o);					\
	__typeof__(*(ptr)) _n_ = (n);					\
	(__typeof__(*(ptr))) __cmpxchg_acquire((ptr),			\
					_o_, _n_, sizeof(*(ptr)));	\
})

#define __cmpxchg_release(ptr, old, new, size)				\
#define _arch_cmpxchg(ptr, old, new, sc_sfx, prepend, append)		\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(*(ptr)) __old = (old);				\
	__typeof__(*(ptr)) __new = (new);				\
	__typeof__(*(ptr)) __ret;					\
	register unsigned int __rc;					\
	switch (size) {							\
	__typeof__(*(__ptr)) __old = (old);				\
	__typeof__(*(__ptr)) __new = (new);				\
	__typeof__(*(__ptr)) __ret;					\
									\
	switch (sizeof(*__ptr)) {					\
	case 1:								\
	case 2:								\
		__arch_cmpxchg_masked(sc_sfx, prepend, append,		\
					__ret, __ptr, __old, __new);	\
		break;							\
	case 4:								\
		__asm__ __volatile__ (					\
			RISCV_RELEASE_BARRIER				\
			"0:	lr.w %0, %2\n"				\
			"	bne  %0, %z3, 1f\n"			\
			"	sc.w %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" ((long)__old), "rJ" (__new)		\
			: "memory");					\
		__arch_cmpxchg(".w", ".w" sc_sfx, prepend, append,	\
				__ret, __ptr, (long), __old, __new);	\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			RISCV_RELEASE_BARRIER				\
			"0:	lr.d %0, %2\n"				\
			"	bne %0, %z3, 1f\n"			\
			"	sc.d %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" (__old), "rJ" (__new)			\
			: "memory");					\
		__arch_cmpxchg(".d", ".d" sc_sfx, prepend, append,	\
				__ret, __ptr, /**/, __old, __new);	\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
	(__typeof__(*(__ptr)))__ret;					\
})

#define arch_cmpxchg_release(ptr, o, n)					\
({									\
	__typeof__(*(ptr)) _o_ = (o);					\
	__typeof__(*(ptr)) _n_ = (n);					\
	(__typeof__(*(ptr))) __cmpxchg_release((ptr),			\
					_o_, _n_, sizeof(*(ptr)));	\
})
/*
 * Compare-and-exchange with no ordering additions: _arch_cmpxchg is given
 * an empty sc suffix and empty prepend/append barrier strings.
 */
#define arch_cmpxchg_relaxed(ptr, o, n)					\
	_arch_cmpxchg((ptr), (o), (n), "", "", "")

#define __cmpxchg(ptr, old, new, size)					\
({									\
	__typeof__(ptr) __ptr = (ptr);					\
	__typeof__(*(ptr)) __old = (old);				\
	__typeof__(*(ptr)) __new = (new);				\
	__typeof__(*(ptr)) __ret;					\
	register unsigned int __rc;					\
	switch (size) {							\
	case 4:								\
		__asm__ __volatile__ (					\
			"0:	lr.w %0, %2\n"				\
			"	bne  %0, %z3, 1f\n"			\
			"	sc.w.rl %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			RISCV_FULL_BARRIER				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" ((long)__old), "rJ" (__new)		\
			: "memory");					\
		break;							\
	case 8:								\
		__asm__ __volatile__ (					\
			"0:	lr.d %0, %2\n"				\
			"	bne %0, %z3, 1f\n"			\
			"	sc.d.rl %1, %z4, %2\n"			\
			"	bnez %1, 0b\n"				\
			RISCV_FULL_BARRIER				\
			"1:\n"						\
			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
			: "rJ" (__old), "rJ" (__new)			\
			: "memory");					\
		break;							\
	default:							\
		BUILD_BUG();						\
	}								\
	__ret;								\
})
/*
 * Compare-and-exchange with acquire ordering: empty sc suffix, nothing
 * prepended, RISCV_ACQUIRE_BARRIER passed as the append string.  The
 * cmpxchg asm templates in this file emit the append string after the
 * sc retry branch and before the exit label, so the compare-failed path
 * that jumps to that label does not execute it.
 */
#define arch_cmpxchg_acquire(ptr, o, n)					\
	_arch_cmpxchg((ptr), (o), (n), "", "", RISCV_ACQUIRE_BARRIER)

/*
 * Compare-and-exchange with release ordering: empty sc suffix, nothing
 * appended, RISCV_RELEASE_BARRIER passed as the prepend string, which the
 * cmpxchg asm templates in this file emit ahead of the lr/sc loop.
 */
#define arch_cmpxchg_release(ptr, o, n)					\
	_arch_cmpxchg((ptr), (o), (n), "", RISCV_RELEASE_BARRIER, "")

#define arch_cmpxchg(ptr, o, n)						\
({									\
	__typeof__(*(ptr)) _o_ = (o);					\
	__typeof__(*(ptr)) _n_ = (n);					\
	(__typeof__(*(ptr))) __cmpxchg((ptr),				\
				       _o_, _n_, sizeof(*(ptr)));	\
})
	_arch_cmpxchg((ptr), (o), (n), ".rl", "", "	fence rw, rw\n")

#define arch_cmpxchg_local(ptr, o, n)					\
	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
	arch_cmpxchg_relaxed((ptr), (o), (n))

#define arch_cmpxchg64(ptr, o, n)					\
({									\