diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6c47cb876ec7..a3e3a6a7d8b3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2017-01-05 Andreas Krebbel + + * config/s390/s390.c (s390_expand_setmem): Unroll the loop for + small constant length operands. + 2017-01-05 Andreas Krebbel * config/s390/s390.c (s390_expand_setmem): Avoid overlapping bytes diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c index 257bce7fa093..1266f45fb1f9 100644 --- a/gcc/config/s390/s390.c +++ b/gcc/config/s390/s390.c @@ -5348,34 +5348,46 @@ s390_expand_setmem (rtx dst, rtx len, rtx val) { const int very_unlikely = REG_BR_PROB_BASE / 100 - 1; - if (GET_CODE (len) == CONST_INT && INTVAL (len) == 0) + if (GET_CODE (len) == CONST_INT && INTVAL (len) <= 0) return; gcc_assert (GET_CODE (val) == CONST_INT || GET_MODE (val) == QImode); - if (GET_CODE (len) == CONST_INT && INTVAL (len) > 0 && INTVAL (len) <= 257) + /* Expand setmem/clrmem for a constant length operand without a + loop if it will be shorter that way. + With a constant length and without pfd argument a + clrmem loop is 32 bytes -> 5.3 * xc + setmem loop is 36 bytes -> 3.6 * (mvi/stc + mvc) */ + if (GET_CODE (len) == CONST_INT + && ((INTVAL (len) <= 256 * 5 && val == const0_rtx) + || INTVAL (len) <= 257 * 3) + && (!TARGET_MVCLE || INTVAL (len) <= 256)) { - if (val == const0_rtx && INTVAL (len) <= 256) - emit_insn (gen_clrmem_short (dst, GEN_INT (INTVAL (len) - 1))); + HOST_WIDE_INT o, l; + + if (val == const0_rtx) + /* clrmem: emit 256 byte blockwise XCs. */ + for (l = INTVAL (len), o = 0; l > 0; l -= 256, o += 256) + { + rtx newdst = adjust_address (dst, BLKmode, o); + emit_insn (gen_clrmem_short (newdst, + GEN_INT (l > 256 ? 255 : l - 1))); + } else - { - /* Initialize memory by storing the first byte. */ - emit_move_insn (adjust_address (dst, QImode, 0), val); - - if (INTVAL (len) > 1) - { - /* Initiate 1 byte overlap move. - The first byte of DST is propagated through DSTP1. 
- Prepare a movmem for: DST+1 = DST (length = LEN - 1). - DST is set to size 1 so the rest of the memory location - does not count as source operand. */ - rtx dstp1 = adjust_address (dst, VOIDmode, 1); - set_mem_size (dst, 1); - - emit_insn (gen_movmem_short (dstp1, dst, - GEN_INT (INTVAL (len) - 2))); - } - } + /* setmem: emit 1(mvi) + 256(mvc) byte blockwise memsets by + setting first byte to val and using a 256 byte mvc with one + byte overlap to propagate the byte. */ + for (l = INTVAL (len), o = 0; l > 0; l -= 257, o += 257) + { + rtx newdst = adjust_address (dst, BLKmode, o); + emit_move_insn (adjust_address (dst, QImode, o), val); + if (l > 1) + { + rtx newdstp1 = adjust_address (dst, BLKmode, o + 1); + emit_insn (gen_movmem_short (newdstp1, newdst, + GEN_INT (l > 257 ? 255 : l - 2))); + } + } } else if (TARGET_MVCLE) diff --git a/gcc/testsuite/gcc.target/s390/memset-1.c b/gcc/testsuite/gcc.target/s390/memset-1.c new file mode 100644 index 000000000000..7b43b97c2085 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/memset-1.c @@ -0,0 +1,134 @@ +/* Make sure that short memset's with constant length are emitted + without loop statements. 
*/ + +/* { dg-do compile } */ +/* { dg-options "-O3 -mzarch" } */ + +/* 1 mvc */ +void +*memset1(void *s, int c) +{ + return __builtin_memset (s, c, 42); +} + +/* 3 mvc */ +void +*memset2(void *s, int c) +{ + return __builtin_memset (s, c, 700); +} + +/* nop */ +void +*memset3(void *s, int c) +{ + return __builtin_memset (s, c, 0); +} + +/* mvc */ +void +*memset4(void *s, int c) +{ + return __builtin_memset (s, c, 256); +} + +/* 2 mvc */ +void +*memset5(void *s, int c) +{ + return __builtin_memset (s, c, 512); +} + +/* still 2 mvc through the additional first byte */ +void +*memset6(void *s, int c) +{ + return __builtin_memset (s, c, 514); +} + +/* still 2 mvc; the last byte needs only the byte store (mvi/stc) */ +void +*memset7(void *s, int c) +{ + return __builtin_memset (s, c, 515); +} + +/* still 3 mvc through the additional first byte */ +void +*memset8(void *s, int c) +{ + return __builtin_memset (s, c, 771); +} + +/* Use mvc loop: 2 mvc */ +void +*memset9(void *s, int c) +{ + return __builtin_memset (s, c, 772); +} + +/* 3 mvc with displacement overflow after the first */ +void +*memset10(void *s, int c) +{ + return __builtin_memset ((char*)s + 4000, c, 700); +} + +/* 1 xc */ +void +*clrmem1(void *s) +{ + return __builtin_memset (s, 0, 42); +} + +/* 3 xc */ +void +*clrmem2(void *s) +{ + return __builtin_memset (s, 0, 700); +} + +/* nop */ +void +*clrmem3(void *s) +{ + return __builtin_memset (s, 0, 0); +} + +/* 1 xc */ +void +*clrmem4(void *s) +{ + return __builtin_memset (s, 0, 256); +} + +/* 2 xc */ +void +*clrmem5(void *s) +{ + return __builtin_memset (s, 0, 512); +} + +/* 3 xc */ +void +*clrmem6(void *s) +{ + return __builtin_memset (s, 0, 768); +} + +/* start using xc loop */ +void +*clrmem7(void *s) +{ + return __builtin_memset (s, 0, 1281); +} + +/* 3 xc with displacement overflow after the first */ +void +*clrmem8(void *s) +{ + return __builtin_memset (s + 4000, 0, 700); +} + +/* { dg-final { scan-assembler-times "mvc" 19 } } */ +/* { dg-final { scan-assembler-times "xc" 15 } } */