i386: Add and use one_operand_p to vec_perm control struct.

* config/i386/i386.c (struct expand_vec_perm_d): Add one_operand_p.
	(ix86_expand_vector_init_duplicate): Initialize it.
	(expand_vec_perm_palignr): Likewise.
	(ix86_expand_vec_perm_const): Likewise.
	(ix86_vectorize_vec_perm_const_ok): Likewise.
	(expand_vec_perm_blend): Use it.
	(expand_vec_perm_vpermil): Likewise.
	(expand_vec_perm_pshufb): Likewise.
	(expand_vec_perm_1): Likewise.
	(expand_vec_perm_pshuflw_pshufhw): Likewise.
	(expand_vec_perm_interleave2): Likewise.
	(expand_vec_perm_vpermq_perm_1): Likewise.
	(expand_vec_perm_vperm2f128): Likewise.
	(expand_vec_perm_interleave3): Likewise.
	(expand_vec_perm_vperm2f128_vblend): Likewise.
	(expand_vec_perm_vpshufb2_vpermq): Likewise.
	(expand_vec_perm_vpshufb2_vpermq_even_odd): Likewise.
	(expand_vec_perm_broadcast): Likewise.
	(expand_vec_perm_vpshufb4_vpermq2): Likewise.

From-SVN: r186017
This commit is contained in:
Richard Henderson 2012-03-30 11:00:21 -07:00 committed by Richard Henderson
parent 7b470faf61
commit e025da0c31
2 changed files with 76 additions and 62 deletions

View File

@ -1,3 +1,25 @@
2012-03-30 Richard Henderson <rth@redhat.com>
* config/i386/i386.c (struct expand_vec_perm_d): Add one_operand_p.
(ix86_expand_vector_init_duplicate): Initialize it.
(expand_vec_perm_palignr): Likewise.
(ix86_expand_vec_perm_const): Likewise.
(ix86_vectorize_vec_perm_const_ok): Likewise.
(expand_vec_perm_blend): Use it.
(expand_vec_perm_vpermil): Likewise.
(expand_vec_perm_pshufb): Likewise.
(expand_vec_perm_1): Likewise.
(expand_vec_perm_pshuflw_pshufhw): Likewise.
(expand_vec_perm_interleave2): Likewise.
(expand_vec_perm_vpermq_perm_1): Likewise.
(expand_vec_perm_vperm2f128): Likewise.
(expand_vec_perm_interleave3): Likewise.
(expand_vec_perm_vperm2f128_vblend): Likewise.
(expand_vec_perm_vpshufb2_vpermq): Likewise.
(expand_vec_perm_vpshufb2_vpermq_even_odd): Likewise.
(expand_vec_perm_broadcast): Likewise.
(expand_vec_perm_vpshufb4_vpermq2): Likewise.
2012-03-30 Richard Henderson <rth@redhat.com> 2012-03-30 Richard Henderson <rth@redhat.com>
* dwarf2out.c (gen_variable_die): Initialize off. * dwarf2out.c (gen_variable_die): Initialize off.

View File

@ -32938,6 +32938,7 @@ struct expand_vec_perm_d
unsigned char perm[MAX_VECT_LEN]; unsigned char perm[MAX_VECT_LEN];
enum machine_mode vmode; enum machine_mode vmode;
unsigned char nelt; unsigned char nelt;
bool one_operand_p;
bool testing_p; bool testing_p;
}; };
@ -33038,6 +33039,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
dperm.vmode = mode; dperm.vmode = mode;
dperm.nelt = GET_MODE_NUNITS (mode); dperm.nelt = GET_MODE_NUNITS (mode);
dperm.op0 = dperm.op1 = gen_reg_rtx (mode); dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
dperm.one_operand_p = true;
/* Extend to SImode using a paradoxical SUBREG. */ /* Extend to SImode using a paradoxical SUBREG. */
tmp1 = gen_reg_rtx (SImode); tmp1 = gen_reg_rtx (SImode);
@ -35735,7 +35737,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
rtx target, op0, op1, x; rtx target, op0, op1, x;
rtx rperm[32], vperm; rtx rperm[32], vperm;
if (d->op0 == d->op1) if (d->one_operand_p)
return false; return false;
if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
; ;
@ -35922,7 +35924,7 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
rtx rperm[8], vperm; rtx rperm[8], vperm;
unsigned i; unsigned i;
if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1) if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
return false; return false;
/* We can only permute within the 128-bit lane. */ /* We can only permute within the 128-bit lane. */
@ -35998,7 +36000,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
nelt = d->nelt; nelt = d->nelt;
if (d->op0 != d->op1) if (!d->one_operand_p)
{ {
if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
{ {
@ -36086,7 +36088,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
else else
{ {
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
if (d->op0 != d->op1) if (!d->one_operand_p)
mask = 2 * nelt - 1; mask = 2 * nelt - 1;
else if (vmode == V16QImode) else if (vmode == V16QImode)
mask = nelt - 1; mask = nelt - 1;
@ -36113,7 +36115,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
target = gen_lowpart (vmode, d->target); target = gen_lowpart (vmode, d->target);
op0 = gen_lowpart (vmode, d->op0); op0 = gen_lowpart (vmode, d->op0);
if (d->op0 == d->op1) if (d->one_operand_p)
{ {
if (vmode == V16QImode) if (vmode == V16QImode)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
@ -36145,7 +36147,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
/* Check plain VEC_SELECT first, because AVX has instructions that could /* Check plain VEC_SELECT first, because AVX has instructions that could
match both SEL and SEL+CONCAT, but the plain SEL will allow a memory match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
input where SEL+CONCAT may not. */ input where SEL+CONCAT may not. */
if (d->op0 == d->op1) if (d->one_operand_p)
{ {
int mask = nelt - 1; int mask = nelt - 1;
bool identity_perm = true; bool identity_perm = true;
@ -36242,7 +36244,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return true; return true;
/* Recognize interleave style patterns with reversed operands. */ /* Recognize interleave style patterns with reversed operands. */
if (d->op0 != d->op1) if (!d->one_operand_p)
{ {
for (i = 0; i < nelt; ++i) for (i = 0; i < nelt; ++i)
{ {
@ -36285,7 +36287,7 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
unsigned i; unsigned i;
bool ok; bool ok;
if (d->vmode != V8HImode || d->op0 != d->op1) if (d->vmode != V8HImode || !d->one_operand_p)
return false; return false;
/* The two permutations only operate in 64-bit lanes. */ /* The two permutations only operate in 64-bit lanes. */
@ -36357,6 +36359,7 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
gen_lowpart (TImode, d->op0), shift)); gen_lowpart (TImode, d->op0), shift));
d->op0 = d->op1 = d->target; d->op0 = d->op1 = d->target;
d->one_operand_p = true;
in_order = true; in_order = true;
for (i = 0; i < nelt; ++i) for (i = 0; i < nelt; ++i)
@ -36396,14 +36399,14 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
if (GET_MODE_SIZE (d->vmode) == 16) if (GET_MODE_SIZE (d->vmode) == 16)
{ {
if (d->op0 == d->op1) if (d->one_operand_p)
return false; return false;
} }
else if (GET_MODE_SIZE (d->vmode) == 32) else if (GET_MODE_SIZE (d->vmode) == 32)
{ {
if (!TARGET_AVX) if (!TARGET_AVX)
return false; return false;
/* For 32-byte modes allow even d->op0 == d->op1. /* For 32-byte modes allow even d->one_operand_p.
The lack of cross-lane shuffling in some instructions The lack of cross-lane shuffling in some instructions
might prevent a single insn shuffle. */ might prevent a single insn shuffle. */
dfinal = *d; dfinal = *d;
@ -36528,11 +36531,11 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
if (nzcnt == 1) if (nzcnt == 1)
{ {
gcc_assert (d->op0 == d->op1); gcc_assert (d->one_operand_p);
nonzero_halves[1] = nonzero_halves[0]; nonzero_halves[1] = nonzero_halves[0];
same_halves = true; same_halves = true;
} }
else if (d->op0 == d->op1) else if (d->one_operand_p)
{ {
gcc_assert (nonzero_halves[0] == 0); gcc_assert (nonzero_halves[0] == 0);
gcc_assert (nonzero_halves[1] == 1); gcc_assert (nonzero_halves[1] == 1);
@ -36571,7 +36574,7 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
} }
} }
} }
else if (d->op0 == d->op1) else if (d->one_operand_p)
return false; return false;
else if (TARGET_AVX2 else if (TARGET_AVX2
&& (contents & (q[0] | q[2] | q[4] | q[6])) == contents) && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
@ -36628,6 +36631,7 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
} }
dfinal.op0 = gen_reg_rtx (dfinal.vmode); dfinal.op0 = gen_reg_rtx (dfinal.vmode);
dfinal.op1 = dfinal.op0; dfinal.op1 = dfinal.op0;
dfinal.one_operand_p = true;
dremap.target = dfinal.op0; dremap.target = dfinal.op0;
/* Test if the final remap can be done with a single insn. For V4SFmode or /* Test if the final remap can be done with a single insn. For V4SFmode or
@ -36671,7 +36675,7 @@ expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
if (!(TARGET_AVX2 if (!(TARGET_AVX2
&& (d->vmode == V32QImode || d->vmode == V16HImode) && (d->vmode == V32QImode || d->vmode == V16HImode)
&& d->op0 == d->op1)) && d->one_operand_p))
return false; return false;
contents[0] = 0; contents[0] = 0;
@ -36699,6 +36703,7 @@ expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
dremap.target = gen_reg_rtx (V4DImode); dremap.target = gen_reg_rtx (V4DImode);
dremap.op0 = gen_lowpart (V4DImode, d->op0); dremap.op0 = gen_lowpart (V4DImode, d->op0);
dremap.op1 = dremap.op0; dremap.op1 = dremap.op0;
dremap.one_operand_p = true;
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
unsigned int cnt = 0; unsigned int cnt = 0;
@ -36712,6 +36717,7 @@ expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
dfinal = *d; dfinal = *d;
dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
dfinal.op1 = dfinal.op0; dfinal.op1 = dfinal.op0;
dfinal.one_operand_p = true;
for (i = 0, j = 0; i < nelt; ++i) for (i = 0, j = 0; i < nelt; ++i)
{ {
if (i == nelt2) if (i == nelt2)
@ -36751,8 +36757,7 @@ expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
return false; return false;
dsecond = *d; dsecond = *d;
if (d->op0 == d->op1) dsecond.one_operand_p = false;
dsecond.op1 = gen_reg_rtx (d->vmode);
dsecond.testing_p = true; dsecond.testing_p = true;
/* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
@ -36821,10 +36826,7 @@ expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
vperm2f128 on d->op0 and d->op1. */ vperm2f128 on d->op0 and d->op1. */
dsecond.testing_p = false; dsecond.testing_p = false;
dfirst = *d; dfirst = *d;
if (d->op0 == d->op1) dfirst.target = gen_reg_rtx (d->vmode);
dfirst.target = dsecond.op1;
else
dfirst.target = gen_reg_rtx (d->vmode);
for (i = 0; i < nelt; i++) for (i = 0; i < nelt; i++)
dfirst.perm[i] = (i & (nelt2 - 1)) dfirst.perm[i] = (i & (nelt2 - 1))
+ ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
@ -36845,9 +36847,8 @@ expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
return true; return true;
} }
/* For d->op0 == d->op1 the only useful vperm2f128 permutation /* For one operand, the only useful vperm2f128 permutation is 0x10. */
is 0x10. */ if (d->one_operand_p)
if (d->op0 == d->op1)
return false; return false;
} }
@ -36864,7 +36865,7 @@ expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
unsigned i, nelt; unsigned i, nelt;
rtx (*gen) (rtx, rtx, rtx); rtx (*gen) (rtx, rtx, rtx);
if (d->op0 == d->op1) if (d->one_operand_p)
return false; return false;
if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
; ;
@ -36947,7 +36948,7 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
if (!TARGET_AVX if (!TARGET_AVX
|| TARGET_AVX2 || TARGET_AVX2
|| (d->vmode != V8SFmode && d->vmode != V4DFmode) || (d->vmode != V8SFmode && d->vmode != V4DFmode)
|| d->op0 != d->op1) || !d->one_operand_p)
return false; return false;
dfirst = *d; dfirst = *d;
@ -36985,6 +36986,7 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
dsecond = *d; dsecond = *d;
dsecond.op0 = dfirst.target; dsecond.op0 = dfirst.target;
dsecond.op1 = dfirst.target; dsecond.op1 = dfirst.target;
dsecond.one_operand_p = true;
dsecond.target = gen_reg_rtx (dsecond.vmode); dsecond.target = gen_reg_rtx (dsecond.vmode);
for (i = 0; i < nelt; i++) for (i = 0; i < nelt; i++)
dsecond.perm[i] = i ^ nelt2; dsecond.perm[i] = i ^ nelt2;
@ -37009,7 +37011,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
return false; return false;
gcc_assert (d->op0 != d->op1); gcc_assert (!d->one_operand_p);
nelt = d->nelt; nelt = d->nelt;
eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
@ -37064,7 +37066,7 @@ expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
unsigned int i, nelt, eltsz; unsigned int i, nelt, eltsz;
if (!TARGET_AVX2 if (!TARGET_AVX2
|| d->op0 != d->op1 || !d->one_operand_p
|| (d->vmode != V32QImode && d->vmode != V16HImode)) || (d->vmode != V32QImode && d->vmode != V16HImode))
return false; return false;
@ -37132,7 +37134,7 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
unsigned int i, nelt, eltsz; unsigned int i, nelt, eltsz;
if (!TARGET_AVX2 if (!TARGET_AVX2
|| d->op0 == d->op1 || d->one_operand_p
|| (d->vmode != V32QImode && d->vmode != V16HImode)) || (d->vmode != V32QImode && d->vmode != V16HImode))
return false; return false;
@ -37491,7 +37493,7 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
{ {
unsigned i, elt, nelt = d->nelt; unsigned i, elt, nelt = d->nelt;
if (d->op0 != d->op1) if (!d->one_operand_p)
return false; return false;
elt = d->perm[0]; elt = d->perm[0];
@ -37514,7 +37516,7 @@ expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
bool used[4]; bool used[4];
if (!TARGET_AVX2 if (!TARGET_AVX2
|| d->op0 == d->op1 || d->one_operand_p
|| (d->vmode != V32QImode && d->vmode != V16HImode)) || (d->vmode != V32QImode && d->vmode != V16HImode))
return false; return false;
@ -37715,6 +37717,7 @@ ix86_expand_vec_perm_const (rtx operands[4])
perm[i] = ei; perm[i] = ei;
} }
d.one_operand_p = true;
switch (which) switch (which)
{ {
default: default:
@ -37722,51 +37725,39 @@ ix86_expand_vec_perm_const (rtx operands[4])
case 3: case 3:
if (!rtx_equal_p (d.op0, d.op1)) if (!rtx_equal_p (d.op0, d.op1))
break; {
d.one_operand_p = false;
break;
}
/* The elements of PERM do not suggest that only the first operand /* The elements of PERM do not suggest that only the first operand
is used, but both operands are identical. Allow easier matching is used, but both operands are identical. Allow easier matching
of the permutation by folding the permutation into the single of the permutation by folding the permutation into the single
input vector. */ input vector. */
for (i = 0; i < nelt; ++i)
if (d.perm[i] >= nelt)
d.perm[i] -= nelt;
/* FALLTHRU */ /* FALLTHRU */
case 1:
d.op1 = d.op0;
break;
case 2: case 2:
for (i = 0; i < nelt; ++i) for (i = 0; i < nelt; ++i)
d.perm[i] -= nelt; d.perm[i] &= nelt - 1;
d.op0 = d.op1; d.op0 = d.op1;
break; break;
case 1:
d.op1 = d.op0;
break;
} }
if (ix86_expand_vec_perm_const_1 (&d)) if (ix86_expand_vec_perm_const_1 (&d))
return true; return true;
/* If the mask says both arguments are needed, but they are the same, /* If the selector says both arguments are needed, but the operands are the
the above tried to expand with d.op0 == d.op1. If that didn't work, same, the above tried to expand with one_operand_p and flattened selector.
retry with d.op0 != d.op1 as that is what testing has been done with. */ If that didn't work, retry without one_operand_p; we succeeded with that
if (which == 3 && d.op0 == d.op1) during testing. */
if (which == 3 && d.one_operand_p)
{ {
rtx seq; d.one_operand_p = false;
bool ok;
memcpy (d.perm, perm, sizeof (perm)); memcpy (d.perm, perm, sizeof (perm));
d.op1 = gen_reg_rtx (d.vmode); return ix86_expand_vec_perm_const_1 (&d);
start_sequence ();
ok = ix86_expand_vec_perm_const_1 (&d);
seq = get_insns ();
end_sequence ();
if (ok)
{
emit_move_insn (d.op1, d.op0);
emit_insn (seq);
return true;
}
} }
return false; return false;
@ -37780,7 +37771,7 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
{ {
struct expand_vec_perm_d d; struct expand_vec_perm_d d;
unsigned int i, nelt, which; unsigned int i, nelt, which;
bool ret, one_vec; bool ret;
d.vmode = vmode; d.vmode = vmode;
d.nelt = nelt = GET_MODE_NUNITS (d.vmode); d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
@ -37817,17 +37808,17 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
d.perm[i] -= nelt; d.perm[i] -= nelt;
/* Check whether the mask can be applied to the vector type. */ /* Check whether the mask can be applied to the vector type. */
one_vec = (which != 3); d.one_operand_p = (which != 3);
/* Implementable with shufps or pshufd. */ /* Implementable with shufps or pshufd. */
if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode)) if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
return true; return true;
/* Otherwise we have to go through the motions and see if we can /* Otherwise we have to go through the motions and see if we can
figure out how to generate the requested permutation. */ figure out how to generate the requested permutation. */
d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
if (!one_vec) if (!d.one_operand_p)
d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
start_sequence (); start_sequence ();
@ -37848,6 +37839,7 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
d.op1 = op1; d.op1 = op1;
d.vmode = GET_MODE (targ); d.vmode = GET_MODE (targ);
d.nelt = nelt = GET_MODE_NUNITS (d.vmode); d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
d.one_operand_p = false;
d.testing_p = false; d.testing_p = false;
for (i = 0; i < nelt; ++i) for (i = 0; i < nelt; ++i)