i386: Extract the guts of mulv16qi3 to ix86_expand_vecop_qihi

* config/i386/sse.md (mul<VI1_AVX2>3): Change from insn_and_split to
        pure expander; move expansion code ...
        * config/i386/i386.c (ix86_expand_vecop_qihi): ... here.  New function.
        * config/i386/i386-protos.h: Update.

From-SVN: r188907
This commit is contained in:
Richard Henderson 2012-06-23 09:41:53 -07:00 committed by Richard Henderson
parent 6b39c806f3
commit 77a3dbf6c4
4 changed files with 97 additions and 60 deletions

View File

@ -1,3 +1,10 @@
2012-06-23 Richard Henderson <rth@redhat.com>
* config/i386/sse.md (mul<VI1_AVX2>3): Change from insn_and_split to
pure expander; move expansion code ...
* config/i386/i386.c (ix86_expand_vecop_qihi): ... here. New function.
* config/i386/i386-protos.h: Update.
2012-06-22 Edmar Wienskoski <edmar@freescale.com>
* config/rs6000/rs6000.md (define_attr "type"): New type popcnt.

View File

@ -192,6 +192,8 @@ extern void ix86_expand_rounddf_32 (rtx, rtx);
extern void ix86_expand_trunc (rtx, rtx);
extern void ix86_expand_truncdf_32 (rtx, rtx);
/* Expand a V*QImode vector operation, identified by its rtx code, in
   terms of the same operation on V*HImode.  Arguments are the code,
   the destination, and the two source operands.  */
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
#endif /* TREE_CODE */

View File

@ -38438,6 +38438,91 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
expand_vec_perm_even_odd_1 (&d, odd);
}
/* Implement the V*QImode vector operation CODE by widening: each input
   is spread out so that every byte occupies its own 16-bit word, the
   same operation is carried out in V*HImode, and the low bytes of the
   word results are gathered back into DEST.  OP1 and OP2 are the two
   source operands.  The mode is taken from DEST; only V16QImode and
   V32QImode are handled, anything else hits gcc_unreachable.  */
void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  enum machine_mode qimode = GET_MODE (dest);
  enum machine_mode himode;
  rtx (*gen_lo) (rtx, rtx, rtx);
  rtx (*gen_hi) (rtx, rtx, rtx);
  rtx a_lo, a_hi, b_lo, b_hi, wide_lo, wide_hi;
  struct expand_vec_perm_d d;
  bool ok;
  int i;

  /* Select the word mode and the interleave generators that go with
     the byte vector mode.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_lo = gen_vec_interleave_lowv16qi;
      gen_hi = gen_vec_interleave_highv16qi;
      break;

    case V32QImode:
      himode = V16HImode;
      gen_lo = gen_avx2_interleave_lowv32qi;
      gen_hi = gen_avx2_interleave_highv32qi;
      break;

    default:
      gcc_unreachable ();
    }

  /* Interleave each operand with itself.  That places a source byte in
     the low byte of every word; the high byte of each word is a don't
     care, and duplicating the low byte there is simpler than arranging
     for a zero.  */
  a_lo = gen_reg_rtx (qimode);
  a_hi = gen_reg_rtx (qimode);
  emit_insn (gen_lo (a_lo, op1, op1));
  emit_insn (gen_hi (a_hi, op1, op1));

  b_lo = gen_reg_rtx (qimode);
  b_hi = gen_reg_rtx (qimode);
  emit_insn (gen_lo (b_lo, op2, op2));
  emit_insn (gen_hi (b_hi, op2, op2));

  /* Do the arithmetic on the word vectors, low half first.  */
  wide_lo = expand_simple_binop (himode, code,
				 gen_lowpart (himode, a_lo),
				 gen_lowpart (himode, b_lo),
				 NULL_RTX, 1, OPTAB_DIRECT);
  wide_hi = expand_simple_binop (himode, code,
				 gen_lowpart (himode, a_hi),
				 gen_lowpart (himode, b_hi),
				 NULL_RTX, 1, OPTAB_DIRECT);
  gcc_assert (wide_lo && wide_hi);

  /* Describe a two-operand constant permutation that picks the result
     bytes out of the two word vectors and stores them in DEST.  */
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;
  d.target = dest;
  d.op0 = gen_lowpart (qimode, wide_lo);
  d.op1 = gen_lowpart (qimode, wide_hi);

  if (qimode == V32QImode)
    {
      /* The AVX2 interleave used above does not cross lanes, so the
	 selection is the even elements with the second and third
	 quarters exchanged.  Happily, that is even one insn shorter
	 than a plain even extraction.  */
      for (i = 0; i < 32; ++i)
	{
	  int sel = i * 2;

	  if ((i & 24) == 8)
	    sel += 16;
	  else if ((i & 24) == 16)
	    sel -= 16;

	  d.perm[i] = sel;
	}
    }
  else
    {
      /* For SSE2 a full interleave was used, so the wanted results are
	 simply the even elements.  */
      for (i = 0; i < 16; ++i)
	d.perm[i] = i * 2;
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  /* Annotate the last emitted insn with the byte-mode operation that
     the whole sequence computes.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{

View File

@ -5213,70 +5213,13 @@
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
(define_insn_and_split "mul<mode>3"
(define_expand "mul<mode>3"
[(set (match_operand:VI1_AVX2 0 "register_operand")
(mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
(match_operand:VI1_AVX2 2 "register_operand")))]
"TARGET_SSE2
&& can_create_pseudo_p ()"
"#"
"&& 1"
[(const_int 0)]
"TARGET_SSE2"
{
rtx t[6];
int i;
enum machine_mode mulmode = <sseunpackmode>mode;
for (i = 0; i < 6; ++i)
t[i] = gen_reg_rtx (<MODE>mode);
/* Unpack data such that we've got a source byte in each low byte of
each word. We don't care what goes into the high byte of each word.
Rather than trying to get zero in there, most convenient is to let
it be a copy of the low byte. */
emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[0], operands[1],
operands[1]));
emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[1], operands[2],
operands[2]));
emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[2], operands[1],
operands[1]));
emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[3], operands[2],
operands[2]));
/* Multiply words. The end-of-line annotations here give a picture of what
the output of that instruction looks like. Dot means don't care; the
letters are the bytes of the result with A being the most significant. */
emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[4]),
gen_rtx_MULT (mulmode, /* .A.B.C.D.E.F.G.H */
gen_lowpart (mulmode, t[0]),
gen_lowpart (mulmode, t[1]))));
emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[5]),
gen_rtx_MULT (mulmode, /* .I.J.K.L.M.N.O.P */
gen_lowpart (mulmode, t[2]),
gen_lowpart (mulmode, t[3]))));
/* Extract the even bytes and merge them back together. */
if (<MODE>mode == V16QImode)
ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
else
{
/* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane,
this can't be normal even extraction, but one where additionally
the second and third quarter are swapped. That is even one insn
shorter than even extraction. */
rtvec v = rtvec_alloc (32);
for (i = 0; i < 32; ++i)
RTVEC_ELT (v, i)
= GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0));
t[0] = operands[0];
t[1] = t[5];
t[2] = t[4];
t[3] = gen_rtx_CONST_VECTOR (<MODE>mode, v);
ix86_expand_vec_perm_const (t);
}
set_unique_reg_note (get_last_insn (), REG_EQUAL,
gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
DONE;
})