i386: Cleanup and unify widening multiply patterns
Prepares for exposing builtin_mul_widen_even/odd hooks for more
efficient reduction.  Adds QImode multiplication.  Shares code
between mulv4si3 and the widening multiplies.

From-SVN: r188957
commit ac3571084f (parent f008d5dc43)
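The even/odd widening multiplies around which the new hooks are built are
the operations x86 actually provides: PMULUDQ multiplies the even-indexed
32-bit elements (0 and 2) of each operand, producing two 64-bit products.
As a minimal sketch of the two primitives in SSE2 intrinsics (an
illustration, not part of the commit):

    #include <emmintrin.h>

    /* Even products: elements 0 and 2 of each operand, widened to 64 bits.  */
    static __m128i
    mul_widen_even_u32 (__m128i a, __m128i b)
    {
      return _mm_mul_epu32 (a, b);
    }

    /* Odd products: shift elements 1 and 3 down into the even slots first,
       the same trick ix86_expand_mul_widen_evenodd uses when odd_p is set.  */
    static __m128i
    mul_widen_odd_u32 (__m128i a, __m128i b)
    {
      return _mm_mul_epu32 (_mm_srli_epi64 (a, 32), _mm_srli_epi64 (b, 32));
    }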
gcc/ChangeLog
@@ -1,3 +1,28 @@
+2012-06-25  Richard Henderson  <rth@redhat.com>
+
+	* config/i386/i386.c (ix86_rtx_costs) [MULT]: Only apply XOP cost
+	to V16QImode.
+	(ix86_expand_vec_interleave): New.
+	(ix86_expand_mul_widen_evenodd): New.
+	(ix86_expand_mul_widen_hilo): New.
+	(ix86_expand_sse2_mulv4si3): Use ix86_expand_mul_widen_evenodd.
+	* config/i386/i386.md (u_bool): New code attr.
+	* config/i386/predicates.md
+	(nonimmediate_or_const_vector_operand): Remove.
+	* config/i386/sse.md (mul<VI4_AVX2>3): Don't use it; don't test
+	both AVX and SSE4_1.
+	(vec_widen<s>mult_hi_<VI2_AVX2>): Remove.
+	(vec_widen<s>mult_lo_<VI2_AVX2>): Remove.
+	(vec_widen<s>mult_hi_v8si): Remove.
+	(vec_widen<s>mult_lo_v8si): Remove.
+	(vec_widen_smult_hi_v4si): Remove.
+	(vec_widen_smult_lo_v4si): Remove.
+	(vec_widen_umult_hi_v4si): Remove.
+	(vec_widen_umult_lo_v4si): Remove.
+	(vec_widen_<s>mult_hi_<VI124_AVX2>): New.
+	(vec_widen_<s>mult_lo_<VI124_AVX2>): New.
+	* config/i386/i386-protos.h: Update.
+
 2012-06-25  Christophe Lyon  <christophe.lyon@st.com>
 
 	* config/arm/neon.md (UNSPEC_VLD1_DUP): Remove.
gcc/config/i386/i386-protos.h
@@ -1,6 +1,6 @@
 /* Definitions of target machine for GCC for IA-32.
    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999,
-   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
    Free Software Foundation, Inc.
 
 This file is part of GCC.
@@ -224,6 +224,8 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
+extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
+extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 
 /* In i386-c.c */
gcc/config/i386/i386.c
@@ -32101,7 +32101,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 	  /* V*QImode is emulated with 1-11 insns.  */
 	  if (mode == V16QImode || mode == V32QImode)
 	    {
-	      int count;
+	      int count = 11;
 	      if (TARGET_XOP && mode == V16QImode)
 		{
 		  /* For XOP we use vpshab, which requires a broadcast of the
@@ -32117,8 +32117,8 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 		    }
 		  count = 3;
 		}
-	      else
-		count = TARGET_SSSE3 ? 7 : 11;
+	      else if (TARGET_SSSE3)
+		count = 7;
 	      *total = cost->fabs * count;
 	    }
 	  else
@@ -32199,7 +32199,11 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
 	  /* V*QImode is emulated with 7-13 insns.  */
 	  if (mode == V16QImode || mode == V32QImode)
 	    {
-	      int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11;
+	      int extra = 11;
+	      if (TARGET_XOP && mode == V16QImode)
+		extra = 5;
+	      else if (TARGET_SSSE3)
+		extra = 6;
 	      *total = cost->fmul * 2 + cost->fabs * extra;
 	    }
 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
@@ -38519,6 +38523,34 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   expand_vec_perm_even_odd_1 (&d, odd);
 }
 
+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt, base;
+  bool ok;
+
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  base = high_p ? nelt / 2 : 0;
+  for (i = 0; i < nelt / 2; ++i)
+    {
+      d.perm[i * 2] = i + base;
+      d.perm[i * 2 + 1] = i + base + nelt;
+    }
+
+  /* Note that for AVX this isn't one instruction.  */
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+}
+
+
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
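The permutation built above alternates elements of op0 (indices 0..nelt-1)
and op1 (indices nelt..2*nelt-1), starting at the low or the high half.  A
standalone model of the index pattern (illustrative only, mirroring the
loop in ix86_expand_vec_interleave):

    /* Fill PERM with the interleave pattern for NELT-element vectors.  */
    static void
    interleave_perm (unsigned nelt, int high_p, unsigned char *perm)
    {
      unsigned base = high_p ? nelt / 2 : 0;
      for (unsigned i = 0; i < nelt / 2; ++i)
        {
          perm[i * 2] = i + base;            /* element from op0 */
          perm[i * 2 + 1] = i + base + nelt; /* element from op1 */
        }
    }

For nelt == 4, high_p yields { 2, 6, 3, 7 } (punpckhdq order) and !high_p
yields { 0, 4, 1, 5 } (punpckldq order).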
@@ -38627,59 +38659,148 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
 }
 
 void
-ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+			       bool uns_p, bool odd_p)
 {
-  rtx op1_m1, op1_m2;
-  rtx op2_m1, op2_m2;
-  rtx res_1, res_2;
+  enum machine_mode mode = GET_MODE (op1);
+  rtx x;
 
-  /* Shift both input vectors down one element, so that elements 3
-     and 1 are now in the slots for elements 2 and 0.  For K8, at
-     least, this is faster than using a shuffle.  */
-  op1_m1 = op1 = force_reg (V4SImode, op1);
-  op1_m2 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2),
-				 gen_lowpart (V1TImode, op1),
-				 GEN_INT (32)));
+  /* We only play even/odd games with vectors of SImode.  */
+  gcc_assert (mode == V4SImode || mode == V8SImode);
 
-  if (GET_CODE (op2) == CONST_VECTOR)
+  /* If we're looking for the odd results, shift those members down to
+     the even slots.  For some cpus this is faster than a PSHUFD.  */
+  if (odd_p)
     {
-      rtvec v;
+      enum machine_mode wmode = GET_MODE (dest);
 
-      /* Constant propagate the vector shift, leaving the dont-care
-	 vector elements as zero.  */
-      v = rtvec_alloc (4);
-      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0);
-      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2);
-      RTVEC_ELT (v, 1) = const0_rtx;
-      RTVEC_ELT (v, 3) = const0_rtx;
-      op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v);
-      op2_m1 = force_reg (V4SImode, op2_m1);
+      op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
+			  1, OPTAB_DIRECT);
+      op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
+			  1, OPTAB_DIRECT);
+      op1 = gen_lowpart (mode, op1);
+      op2 = gen_lowpart (mode, op2);
+    }
 
-      v = rtvec_alloc (4);
-      RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1);
-      RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3);
-      RTVEC_ELT (v, 1) = const0_rtx;
-      RTVEC_ELT (v, 3) = const0_rtx;
-      op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v);
-      op2_m2 = force_reg (V4SImode, op2_m2);
-    }
+  if (mode == V8SImode)
+    {
+      if (uns_p)
+	x = gen_avx2_umulv4siv4di3 (dest, op1, op2);
+      else
+	x = gen_avx2_mulv4siv4di3 (dest, op1, op2);
+    }
+  else if (uns_p)
+    x = gen_sse2_umulv2siv2di3 (dest, op1, op2);
+  else if (TARGET_SSE4_1)
+    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+  else if (TARGET_XOP)
+    {
+      x = force_reg (V2DImode, CONST0_RTX (V2DImode));
+      x = gen_xop_pmacsdql (dest, op1, op2, x);
+    }
   else
-    {
-      op2_m1 = op2 = force_reg (V4SImode, op2);
-      op2_m2 = gen_reg_rtx (V4SImode);
-      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2),
-				     gen_lowpart (V1TImode, op2),
-				     GEN_INT (32)));
-    }
+    gcc_unreachable ();
+  emit_insn (x);
+}
+
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+			    bool uns_p, bool high_p)
+{
+  enum machine_mode wmode = GET_MODE (dest);
+  enum machine_mode mode = GET_MODE (op1);
+  rtx t1, t2, t3, t4, mask;
+
+  switch (mode)
+    {
+    case V4SImode:
+      t1 = gen_reg_rtx (mode);
+      t2 = gen_reg_rtx (mode);
+      if (TARGET_XOP && !uns_p)
+	{
+	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
+	     shuffle the elements once so that all elements are in the right
+	     place for immediate use: { A C B D }.  */
+	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+					const1_rtx, GEN_INT (3)));
+	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+					const1_rtx, GEN_INT (3)));
+	}
+      else
+	{
+	  /* Put the elements into place for the multiply.  */
+	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
+	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
+	  high_p = false;
+	}
+      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+      break;
+
+    case V8SImode:
+      /* Shuffle the elements between the lanes.  After this we
+	 have { A B E F | C D G H } for each operand.  */
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+				      const0_rtx, const2_rtx,
+				      const1_rtx, GEN_INT (3)));
+      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+				      const0_rtx, const2_rtx,
+				      const1_rtx, GEN_INT (3)));
+
+      /* Shuffle the elements within the lanes.  After this we
+	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
+      t3 = gen_reg_rtx (V8SImode);
+      t4 = gen_reg_rtx (V8SImode);
+      mask = GEN_INT (high_p
+		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+      break;
+
+    case V8HImode:
+    case V16HImode:
+      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+			 uns_p, OPTAB_DIRECT);
+      t2 = expand_binop (mode,
+			 uns_p ? umul_highpart_optab : smul_highpart_optab,
+			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+      gcc_assert (t1 && t2);
+
+      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      break;
+
+    case V16QImode:
+    case V32QImode:
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+      emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx res_1, res_2;
 
   /* Widening multiply of elements 0+2, and 1+3.  */
   res_1 = gen_reg_rtx (V4SImode);
   res_2 = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1),
-				     op1_m1, op2_m1));
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2),
-				     op1_m2, op2_m2));
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
+				 op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
+				 op1, op2, true, true);
 
   /* Move the results in element 2 down to element 1; we don't care
      what goes in elements 2 and 3.  Then we can merge the parts
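For the hi/lo forms, ix86_expand_mul_widen_hilo interleaves each operand
with itself so the even-multiply primitive sees the desired pairs; the
pshufd masks in the V8SImode case are packed 2-bit element selectors,
e.g. 2 + (2 << 2) + (3 << 4) + (3 << 6) == 0xfa selects { 2, 2, 3, 3 }.
A sketch of the unsigned V4SImode low case in intrinsics (an illustration
of the technique, not the commit's code):

    #include <emmintrin.h>

    /* vec_widen_umult_lo_v4si via self-interleave: duplicate each of the
       two low elements into a 64-bit slot, then take the even products.  */
    static __m128i
    widen_umult_lo_v4si (__m128i a, __m128i b)
    {
      __m128i t1 = _mm_unpacklo_epi32 (a, a);   /* { a0 a0 a1 a1 } */
      __m128i t2 = _mm_unpacklo_epi32 (b, b);   /* { b0 b0 b1 b1 } */
      return _mm_mul_epu32 (t1, t2);            /* { a0*b0, a1*b1 } */
    }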
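ix86_expand_sse2_mulv4si3 now reuses the even/odd helper twice and merges
the low halves of the four 64-bit products.  The same idea in intrinsics
(a sketch; the commit's merge uses a slightly different shuffle sequence):

    #include <emmintrin.h>

    /* 32-bit element-wise multiply without SSE4.1 PMULLD.  */
    static __m128i
    mulv4si3_sse2 (__m128i a, __m128i b)
    {
      __m128i even = _mm_mul_epu32 (a, b);                    /* products 0, 2 */
      __m128i odd  = _mm_mul_epu32 (_mm_srli_epi64 (a, 32),
                                    _mm_srli_epi64 (b, 32));  /* products 1, 3 */
      /* Keep the low 32 bits of each product, then interleave back into
         element order 0, 1, 2, 3.  */
      __m128i even_lo = _mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0));
      __m128i odd_lo  = _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0));
      return _mm_unpacklo_epi32 (even_lo, odd_lo);
    }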
gcc/config/i386/i386.md
@@ -744,6 +744,7 @@
 ;; Prefix for define_insn
 (define_code_attr u [(sign_extend "") (zero_extend "u")])
 (define_code_attr s [(sign_extend "s") (zero_extend "u")])
+(define_code_attr u_bool [(sign_extend "false") (zero_extend "true")])
 
 ;; All integer modes.
 (define_mode_iterator SWI1248x [QI HI SI DI])
gcc/config/i386/predicates.md
@@ -816,13 +816,6 @@
   return false;
 })
 
-;; Return true when OP is a nonimmediate or a vector constant.  Note
-;; that most vector constants are not legitimate operands, so we need
-;; to special-case this.
-(define_predicate "nonimmediate_or_const_vector_operand"
-  (ior (match_code "const_vector")
-       (match_operand 0 "nonimmediate_operand")))
-
 ;; Return true if OP is a register or a zero.
 (define_predicate "reg_or_0_operand"
   (ior (match_operand 0 "register_operand")
gcc/config/i386/sse.md
@@ -5555,10 +5555,10 @@
   [(set (match_operand:VI4_AVX2 0 "register_operand")
	(mult:VI4_AVX2
	  (match_operand:VI4_AVX2 1 "nonimmediate_operand")
-	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))]
+	  (match_operand:VI4_AVX2 2 "nonimmediate_operand")))]
   "TARGET_SSE2"
 {
-  if (TARGET_SSE4_1 || TARGET_AVX)
+  if (TARGET_SSE4_1)
     {
       if (CONSTANT_P (operands[2]))
	operands[2] = force_const_mem (<MODE>mode, operands[2]);
@@ -5677,198 +5677,28 @@
 (define_expand "vec_widen_<s>mult_hi_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI2_AVX2 1 "register_operand"))
-   (match_operand:VI2_AVX2 2 "register_operand")]
-  "TARGET_SSE2"
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  ; Note that SSE2 does not have signed SI multiply
+  "TARGET_XOP || TARGET_SSE4_1
+   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (<MODE>mode);
-  t2 = gen_reg_rtx (<MODE>mode);
-  dest = gen_lowpart (<MODE>mode, operands[0]);
-
-  emit_insn (gen_mul<mode>3 (t1, op1, op2));
-  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+			      <u_bool>, true);
   DONE;
 })
 
 (define_expand "vec_widen_<s>mult_lo_<mode>"
   [(match_operand:<sseunpackmode> 0 "register_operand")
    (any_extend:<sseunpackmode>
-     (match_operand:VI2_AVX2 1 "register_operand"))
-   (match_operand:VI2_AVX2 2 "register_operand")]
-  "TARGET_SSE2"
+     (match_operand:VI124_AVX2 1 "register_operand"))
+   (match_operand:VI124_AVX2 2 "register_operand")]
+  ; Note that SSE2 does not have signed SI multiply
+  "TARGET_XOP || TARGET_SSE4_1
+   || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (<MODE>mode);
-  t2 = gen_reg_rtx (<MODE>mode);
-  dest = gen_lowpart (<MODE>mode, operands[0]);
-
-  emit_insn (gen_mul<mode>3 (t1, op1, op2));
-  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
-  DONE;
-})
-
-(define_expand "vec_widen_<s>mult_hi_v8si"
-  [(match_operand:V4DI 0 "register_operand")
-   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
-   (match_operand:V8SI 2 "nonimmediate_operand")]
-  "TARGET_AVX2"
-{
-  rtx t1, t2, t3, t4;
-
-  t1 = gen_reg_rtx (V4DImode);
-  t2 = gen_reg_rtx (V4DImode);
-  t3 = gen_reg_rtx (V8SImode);
-  t4 = gen_reg_rtx (V8SImode);
-  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
-				  const0_rtx, const2_rtx,
-				  const1_rtx, GEN_INT (3)));
-  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
-				  const0_rtx, const2_rtx,
-				  const1_rtx, GEN_INT (3)));
-  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
-				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
-  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
-				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
-  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
-  DONE;
-})
-
-(define_expand "vec_widen_<s>mult_lo_v8si"
-  [(match_operand:V4DI 0 "register_operand")
-   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand"))
-   (match_operand:V8SI 2 "nonimmediate_operand")]
-  "TARGET_AVX2"
-{
-  rtx t1, t2, t3, t4;
-
-  t1 = gen_reg_rtx (V4DImode);
-  t2 = gen_reg_rtx (V4DImode);
-  t3 = gen_reg_rtx (V8SImode);
-  t4 = gen_reg_rtx (V8SImode);
-  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
-				  const0_rtx, const2_rtx,
-				  const1_rtx, GEN_INT (3)));
-  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
-				  const0_rtx, const2_rtx,
-				  const1_rtx, GEN_INT (3)));
-  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
-				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
-  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
-				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
-  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
-  DONE;
-})
-
-(define_expand "vec_widen_smult_hi_v4si"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:V4SI 1 "register_operand")
-   (match_operand:V4SI 2 "register_operand")]
-  "TARGET_SSE4_1"
-{
-  rtx op1, op2, t1, t2;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-
-  if (TARGET_XOP)
-    {
-      rtx t3 = gen_reg_rtx (V2DImode);
-
-      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
-				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
-				    GEN_INT (1), GEN_INT (3)));
-      emit_move_insn (t3, CONST0_RTX (V2DImode));
-
-      emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
-      DONE;
-    }
-
-  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
-  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
-  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
-  DONE;
-})
-
-(define_expand "vec_widen_smult_lo_v4si"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:V4SI 1 "register_operand")
-   (match_operand:V4SI 2 "register_operand")]
-  "TARGET_SSE4_1"
-{
-  rtx op1, op2, t1, t2;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-
-  if (TARGET_XOP)
-    {
-      rtx t3 = gen_reg_rtx (V2DImode);
-
-      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
-				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
-				    GEN_INT (1), GEN_INT (3)));
-      emit_move_insn (t3, CONST0_RTX (V2DImode));
-
-      emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
-      DONE;
-    }
-
-  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
-  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
-  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
-  DONE;
-})
-
-(define_expand "vec_widen_umult_hi_v4si"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:V4SI 1 "register_operand")
-   (match_operand:V4SI 2 "register_operand")]
-  "TARGET_SSE2"
-{
-  rtx op1, op2, t1, t2;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
-  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
-  emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
-  DONE;
-})
-
-(define_expand "vec_widen_umult_lo_v4si"
-  [(match_operand:V2DI 0 "register_operand")
-   (match_operand:V4SI 1 "register_operand")
-   (match_operand:V4SI 2 "register_operand")]
-  "TARGET_SSE2"
-{
-  rtx op1, op2, t1, t2;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-
-  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
-  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
-  emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2));
+  ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2],
+			      <u_bool>, false);
   DONE;
 })
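The two surviving expanders are what the vectorizer instantiates for
widening multiplies.  A loop of the following shape (an illustrative
example, not from the commit) is the typical consumer; for short inputs
it vectorizes through vec_widen_smult_lo_<mode> and
vec_widen_smult_hi_<mode>:

    /* short * short -> int products; the promoted multiply is recognized
       as a widening multiply by the vectorizer.  */
    void
    widen_mul (int *restrict r, const short *restrict a,
               const short *restrict b, int n)
    {
      for (int i = 0; i < n; ++i)
        r[i] = a[i] * b[i];
    }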