S/390: Unroll mvc/xc loop for memset with small constant lengths.

When expanding a memset we emit a loop of MVC/XC instructions dealing
with 256 byte blocks.  This loop used to get unrolled by older GCCs
when using constant length operands.  GCC probably lost this ability
when more of the loop unrolling machinery was moved to the tree level.

With this patch the unrolling is done manually when emitting the RTL
insns.

gcc/testsuite/ChangeLog:

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* gcc.target/s390/memset-1.c: New test.

gcc/ChangeLog:

2017-01-05  Andreas Krebbel  <krebbel@linux.vnet.ibm.com>

	* config/s390/s390.c (s390_expand_setmem): Unroll the loop for
	small constant length operands.

From-SVN: r244097
This commit is contained in:
Andreas Krebbel 2017-01-05 10:00:34 +00:00 committed by Andreas Krebbel
parent 587790e60d
commit 8597cd335e
3 changed files with 173 additions and 22 deletions

View File

@ -1,3 +1,8 @@
2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
* config/s390/s390.c (s390_expand_setmem): Unroll the loop for
small constant length operands.
2017-01-05 Andreas Krebbel <krebbel@linux.vnet.ibm.com>
* config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes

View File

@ -5348,34 +5348,46 @@ s390_expand_setmem (rtx dst, rtx len, rtx val)
{
const int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0)
if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0)
return;
gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode);
if (GET_CODE (len) == CONST_INT && INTVAL (len) > 0 && INTVAL (len) <= 257)
/* Expand setmem/clrmem for a constant length operand without a
loop if it will be shorter that way.
With a constant length and without pfd argument a
clrmem loop is 32 bytes -> 5.3 * xc
setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */
if (GET_CODE (len) == CONST_INT
&& ((INTVAL (len) <= 256 * 5 && val == const0_rtx)
|| INTVAL (len) <= 257 * 3)
&& (!TARGET_MVCLE || INTVAL (len) <= 256))
{
if (val == const0_rtx && INTVAL (len) <= 256)
emit_insn (gen_clrmem_short (dst, GEN_INT (INTVAL (len) - 1)));
HOST_WIDE_INT o, l;
if (val == const0_rtx)
/* clrmem: emit 256 byte blockwise XCs. */
for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256)
{
rtx newdst = adjust_address (dst, BLKmode, o);
emit_insn (gen_clrmem_short (newdst,
GEN_INT (l > 256 ? 255 : l - 1)));
}
else
{
/* Initialize memory by storing the first byte. */
emit_move_insn (adjust_address (dst, QImode, 0), val);
if (INTVAL (len) > 1)
{
/* Initiate 1 byte overlap move.
The first byte of DST is propagated through DSTP1.
Prepare a movmem for: DST+1 = DST (length = LEN - 1).
DST is set to size 1 so the rest of the memory location
does not count as source operand. */
rtx dstp1 = adjust_address (dst, VOIDmode, 1);
set_mem_size (dst, 1);
emit_insn (gen_movmem_short (dstp1, dst,
GEN_INT (INTVAL (len) - 2)));
}
}
/* setmem: emit 1(mvi) + 256(mvc) byte blockwise memsets by
setting first byte to val and using a 256 byte mvc with one
byte overlap to propagate the byte. */
for (l = INTVAL (len), o = 0; l > 0; l -= 257, o += 257)
{
rtx newdst = adjust_address (dst, BLKmode, o);
emit_move_insn (adjust_address (dst, QImode, o), val);
if (l > 1)
{
rtx newdstp1 = adjust_address (dst, BLKmode, o + 1);
emit_insn (gen_movmem_short (newdstp1, newdst,
GEN_INT (l > 257 ? 255 : l - 2)));
}
}
}
else if (TARGET_MVCLE)

View File

@ -0,0 +1,134 @@
/* Make sure that short memsets with constant length are emitted
without loop statements. */
/* { dg-do compile } */
/* { dg-options "-O3 -mzarch" } */
/* Fill 42 bytes with C.  The whole area fits into one chunk: the
   first byte is stored directly and propagated by 1 mvc.  */
void *
memset1 (void *s, int c)
{
  return __builtin_memset (s, c, 42);
}
/* Fill 700 bytes with C.  700 = 257 + 257 + 186, i.e. three chunks
   each consisting of a byte store plus an mvc: 3 mvc.  */
void *
memset2 (void *s, int c)
{
  return __builtin_memset (s, c, 700);
}
/* Zero length memset: nothing has to be emitted (nop).  */
void *
memset3 (void *s, int c)
{
  return __builtin_memset (s, c, 0);
}
/* Fill 256 bytes with C.  Still a single chunk: one byte store
   followed by 1 mvc for the remaining bytes.  */
void *
memset4 (void *s, int c)
{
  return __builtin_memset (s, c, 256);
}
/* Fill 512 bytes with C.  512 = 257 + 255, i.e. two chunks: 2 mvc.  */
void *
memset5 (void *s, int c)
{
  return __builtin_memset (s, c, 512);
}
/* Fill 514 bytes with C.  514 = 2 * 257: every chunk covers its
   directly stored first byte plus a 256 byte mvc, so this is still
   2 mvc.  */
void *
memset6 (void *s, int c)
{
  return __builtin_memset (s, c, 514);
}
/* Fill 515 bytes with C.  515 = 2 * 257 + 1: two full chunks (2 mvc)
   and one trailing byte which is written by the plain byte store
   alone, without a third mvc.  */
void *
memset7 (void *s, int c)
{
  return __builtin_memset (s, c, 515);
}
/* Fill 771 bytes with C.  771 = 3 * 257 is the largest length that
   is still expanded without a loop: 3 mvc, each preceded by the
   store of the chunk's first byte.  */
void *
memset8 (void *s, int c)
{
  return __builtin_memset (s, c, 771);
}
/* Fill 772 bytes with C.  This is one byte above the 3 * 257 limit
   for unrolling, so the mvc loop is used instead: expect 2 mvc.  */
void *
memset9 (void *s, int c)
{
  return __builtin_memset (s, c, 772);
}
/* Fill 700 bytes with C starting 4000 bytes into S: 3 mvc, with a
   displacement overflow after the first one.  */
void *
memset10 (void *s, int c)
{
  return __builtin_memset ((char *) s + 4000, c, 700);
}
/* Clear 42 bytes.  A single block of at most 256 bytes: 1 xc.  */
void *
clrmem1 (void *s)
{
  return __builtin_memset (s, 0, 42);
}
/* Clear 700 bytes.  700 = 256 + 256 + 188, i.e. three blocks: 3 xc.  */
void *
clrmem2 (void *s)
{
  return __builtin_memset (s, 0, 700);
}
/* Zero length clear: nothing has to be emitted (nop).  */
void *
clrmem3 (void *s)
{
  return __builtin_memset (s, 0, 0);
}
/* Clear 256 bytes.  Exactly one full 256 byte block: 1 xc.  */
void *
clrmem4 (void *s)
{
  return __builtin_memset (s, 0, 256);
}
/* Clear 512 bytes.  512 = 2 * 256, i.e. two full blocks: 2 xc.  */
void *
clrmem5 (void *s)
{
  return __builtin_memset (s, 0, 512);
}
/* Clear 768 bytes.  768 = 3 * 256, i.e. three full blocks: 3 xc.  */
void *
clrmem6 (void *s)
{
  return __builtin_memset (s, 0, 768);
}
/* Clear 1281 bytes.  This is one byte above the 5 * 256 limit for
   unrolling, so the xc loop starts being used here.  */
void *
clrmem7 (void *s)
{
  return __builtin_memset (s, 0, 1281);
}
/* Clear 700 bytes starting 4000 bytes into S: 3 xc, with a
   displacement overflow after the first one.

   The offset is applied through a char pointer; arithmetic directly
   on a void pointer is a GNU extension and not valid ISO C.  */
void *
clrmem8 (void *s)
{
  return __builtin_memset ((char *) s + 4000, 0, 700);
}
/* { dg-final { scan-assembler-times "mvc" 19 } } */
/* { dg-final { scan-assembler-times "xc" 15 } } */