mirror of git://gcc.gnu.org/git/gcc.git
i386: Extract the guts of mulv16qi3 to ix86_expand_vecop_qihi
* config/i386/sse.md (mul<VI1_AVX2>3): Change from insn_and_split to
pure expander; move expansion code ...
* config/i386/i386.c (ix86_expand_vecop_qihi): ... here. New function.
* config/i386/i386-protos.h: Update.
From-SVN: r188907
This commit is contained in:
parent
6b39c806f3
commit
77a3dbf6c4
|
|
@ -1,3 +1,10 @@
|
|||
2012-06-23 Richard Henderson <rth@redhat.com>
|
||||
|
||||
* config/i386/sse.md (mul<VI1_AVX2>3): Change from insn_and_split to
|
||||
pure expander; move expansion code ...
|
||||
* config/i386/i386.c (ix86_expand_vecop_qihi): ... here. New function.
|
||||
* config/i386/i386-protos.h: Update.
|
||||
|
||||
2012-06-22 Edmar Wienskoski <edmar@freescale.com>
|
||||
|
||||
* config/rs6000/rs6000.md (define_attr "type"): New type popcnt.
|
||||
|
|
|
|||
|
|
@ -192,6 +192,8 @@ extern void ix86_expand_rounddf_32 (rtx, rtx);
|
|||
extern void ix86_expand_trunc (rtx, rtx);
|
||||
extern void ix86_expand_truncdf_32 (rtx, rtx);
|
||||
|
||||
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
|
||||
|
||||
#ifdef TREE_CODE
|
||||
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
|
||||
#endif /* TREE_CODE */
|
||||
|
|
|
|||
|
|
@ -38438,6 +38438,91 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
|
|||
expand_vec_perm_even_odd_1 (&d, odd);
|
||||
}
|
||||
|
||||
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  The x86 ISA has no general byte-wise
   vector arithmetic (e.g. no PMULLB), so we widen each byte to a word,
   do the operation word-wise, and then pick the even (low) bytes of
   each word result back out with a constant permutation.

   CODE is the rtx binary operation (e.g. MULT); DEST, OP1 and OP2 are
   V16QImode or V32QImode register operands.  Emits insns into the
   current sequence; DEST receives the result.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  enum machine_mode qimode = GET_MODE (dest);
  enum machine_mode himode;
  /* Generators for the interleave-low/high insns of the chosen mode.  */
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok;
  int i;

  /* Select the word mode and interleave generators matching the byte
     vector width.  Only the SSE2 (128-bit) and AVX2 (256-bit) byte
     vector modes are supported.  */
  if (qimode == V16QImode)
    {
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
    }
  else if (qimode == V32QImode)
    {
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
    }
  else
    gcc_unreachable ();

  /* Unpack data such that we've got a source byte in each low byte of
     each word.  We don't care what goes into the high byte of each word.
     Rather than trying to get zero in there, most convenient is to let
     it be a copy of the low byte.  Interleaving an operand with itself
     achieves exactly that (punpcklbw/punpckhbw x,x).  */
  op1_l = gen_reg_rtx (qimode);
  op1_h = gen_reg_rtx (qimode);
  emit_insn (gen_il (op1_l, op1, op1));
  emit_insn (gen_ih (op1_h, op1, op1));

  op2_l = gen_reg_rtx (qimode);
  op2_h = gen_reg_rtx (qimode);
  emit_insn (gen_il (op2_l, op2, op2));
  emit_insn (gen_ih (op2_h, op2, op2));

  /* Perform the operation word-wise on both the low and high halves.
     The low byte of each word result is the desired byte result; the
     high (garbage) byte is discarded by the permutation below.  */
  res_l = expand_simple_binop (himode, code, gen_lowpart (himode, op1_l),
			       gen_lowpart (himode, op2_l), NULL_RTX,
			       1, OPTAB_DIRECT);
  res_h = expand_simple_binop (himode, code, gen_lowpart (himode, op1_h),
			       gen_lowpart (himode, op2_h), NULL_RTX,
			       1, OPTAB_DIRECT);
  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place: set up a two-operand
     constant permutation that gathers the even bytes of (res_l, res_h)
     into DEST.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (qimode == V16QImode)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < 16; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 The (i & 24) test identifies which 8-element quarter index I
	 falls in; quarters 1 and 2 exchange places via the +/-16 bias.  */
      for (i = 0; i < 32; ++i)
	d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
    }

  /* The caller-visible contract is that this expansion always succeeds;
     the chosen permutations are known to be implementable on any ISA
     level that reaches here.  */
  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  /* Attach a REG_EQUAL note describing the whole sequence as the simple
     byte-wise operation, so later RTL passes can treat it as such.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
|
||||
|
||||
void
|
||||
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -5213,70 +5213,13 @@
|
|||
(set_attr "prefix" "orig,vex")
|
||||
(set_attr "mode" "TI")])
|
||||
|
||||
(define_insn_and_split "mul<mode>3"
|
||||
(define_expand "mul<mode>3"
|
||||
[(set (match_operand:VI1_AVX2 0 "register_operand")
|
||||
(mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
|
||||
(match_operand:VI1_AVX2 2 "register_operand")))]
|
||||
"TARGET_SSE2
|
||||
&& can_create_pseudo_p ()"
|
||||
"#"
|
||||
"&& 1"
|
||||
[(const_int 0)]
|
||||
"TARGET_SSE2"
|
||||
{
|
||||
rtx t[6];
|
||||
int i;
|
||||
enum machine_mode mulmode = <sseunpackmode>mode;
|
||||
|
||||
for (i = 0; i < 6; ++i)
|
||||
t[i] = gen_reg_rtx (<MODE>mode);
|
||||
|
||||
/* Unpack data such that we've got a source byte in each low byte of
|
||||
each word. We don't care what goes into the high byte of each word.
|
||||
Rather than trying to get zero in there, most convenient is to let
|
||||
it be a copy of the low byte. */
|
||||
emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[0], operands[1],
|
||||
operands[1]));
|
||||
emit_insn (gen_<vec_avx2>_interleave_high<mode> (t[1], operands[2],
|
||||
operands[2]));
|
||||
emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[2], operands[1],
|
||||
operands[1]));
|
||||
emit_insn (gen_<vec_avx2>_interleave_low<mode> (t[3], operands[2],
|
||||
operands[2]));
|
||||
|
||||
/* Multiply words. The end-of-line annotations here give a picture of what
|
||||
the output of that instruction looks like. Dot means don't care; the
|
||||
letters are the bytes of the result with A being the most significant. */
|
||||
emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[4]),
|
||||
gen_rtx_MULT (mulmode, /* .A.B.C.D.E.F.G.H */
|
||||
gen_lowpart (mulmode, t[0]),
|
||||
gen_lowpart (mulmode, t[1]))));
|
||||
emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (mulmode, t[5]),
|
||||
gen_rtx_MULT (mulmode, /* .I.J.K.L.M.N.O.P */
|
||||
gen_lowpart (mulmode, t[2]),
|
||||
gen_lowpart (mulmode, t[3]))));
|
||||
|
||||
/* Extract the even bytes and merge them back together. */
|
||||
if (<MODE>mode == V16QImode)
|
||||
ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
|
||||
else
|
||||
{
|
||||
/* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane,
|
||||
this can't be normal even extraction, but one where additionally
|
||||
the second and third quarter are swapped. That is even one insn
|
||||
shorter than even extraction. */
|
||||
rtvec v = rtvec_alloc (32);
|
||||
for (i = 0; i < 32; ++i)
|
||||
RTVEC_ELT (v, i)
|
||||
= GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0));
|
||||
t[0] = operands[0];
|
||||
t[1] = t[5];
|
||||
t[2] = t[4];
|
||||
t[3] = gen_rtx_CONST_VECTOR (<MODE>mode, v);
|
||||
ix86_expand_vec_perm_const (t);
|
||||
}
|
||||
|
||||
set_unique_reg_note (get_last_insn (), REG_EQUAL,
|
||||
gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
|
||||
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
|
||||
DONE;
|
||||
})
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue