mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			i386: Cleanup and unify widening multiply patterns
Prepares for exposing builtin_mul_widen_even/odd hooks for more efficient reduction. Adds QImode multiplication. Shares code between mulv4si3 and the widening multiplies. From-SVN: r188957
This commit is contained in:
		
							parent
							
								
									f008d5dc43
								
							
						
					
					
						commit
						ac3571084f
					
				|  | @ -1,3 +1,28 @@ | |||
| 2012-06-25  Richard Henderson  <rth@redhat.com> | ||||
| 
 | ||||
| 	* config/i386/i386.c (ix86_rtx_costs) [MULT]: Only apply XOP cost | ||||
| 	to V16QImode. | ||||
| 	(ix86_expand_vec_interleave): New. | ||||
| 	(ix86_expand_mul_widen_evenodd): New. | ||||
| 	(ix86_expand_mul_widen_hilo): New. | ||||
| 	(ix86_expand_sse2_mulv4si3): Use ix86_expand_mul_widen_evenodd. | ||||
| 	* config/i386/i386.md (u_bool): New code attr. | ||||
| 	* config/i386/predicates.md | ||||
| 	(nonimmediate_or_const_vector_operand): Remove. | ||||
| 	* config/i386/sse.md (mul<VI4_AVX2>3): Don't use it; don't test | ||||
| 	both AVX and SSE4_1. | ||||
| 	(vec_widen<s>mult_hi_<VI2_AVX2>): Remove. | ||||
| 	(vec_widen<s>mult_lo_<VI2_AVX2>): Remove. | ||||
| 	(vec_widen<s>mult_hi_v8si): Remove. | ||||
| 	(vec_widen<s>mult_lo_v8si): Remove. | ||||
| 	(vec_widen_smult_hi_v4si): Remove. | ||||
| 	(vec_widen_smult_lo_v4si): Remove. | ||||
| 	(vec_widen_umult_hi_v4si): Remove. | ||||
| 	(vec_widen_umult_lo_v4si): Remove. | ||||
| 	(vec_widen_<s>mult_hi_<VI124_AVX2>): New. | ||||
| 	(vec_widen_<s>mult_lo_<VI124_AVX2>): New. | ||||
| 	* config/i386/i386-protos.h: Update. | ||||
| 
 | ||||
| 2012-06-25  Christophe Lyon  <christophe.lyon@st.com> | ||||
| 
 | ||||
| 	* config/arm/neon.md (UNSPEC_VLD1_DUP): Remove. | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| /* Definitions of target machine for GCC for IA-32.
 | ||||
|    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1996, 1997, 1998, 1999, | ||||
|    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 | ||||
|    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 | ||||
|    Free Software Foundation, Inc. | ||||
| 
 | ||||
| This file is part of GCC. | ||||
|  | @ -224,6 +224,8 @@ extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx); | |||
| 
 | ||||
| extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned); | ||||
| extern bool ix86_expand_pinsr (rtx *); | ||||
| extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool); | ||||
| extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool); | ||||
| extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx); | ||||
| 
 | ||||
| /* In i386-c.c  */ | ||||
|  |  | |||
|  | @ -32101,7 +32101,7 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, | |||
| 	  /* V*QImode is emulated with 1-11 insns.  */ | ||||
| 	  if (mode == V16QImode || mode == V32QImode) | ||||
| 	    { | ||||
| 	      int count; | ||||
| 	      int count = 11; | ||||
| 	      if (TARGET_XOP && mode == V16QImode) | ||||
| 		{ | ||||
| 		  /* For XOP we use vpshab, which requires a broadcast of the
 | ||||
|  | @ -32117,8 +32117,8 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, | |||
| 		    } | ||||
| 		  count = 3; | ||||
| 		} | ||||
| 	      else | ||||
| 		count = TARGET_SSSE3 ? 7 : 11; | ||||
| 	      else if (TARGET_SSSE3) | ||||
| 		count = 7; | ||||
| 	      *total = cost->fabs * count; | ||||
| 	    } | ||||
| 	  else | ||||
|  | @ -32199,7 +32199,11 @@ ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total, | |||
| 	  /* V*QImode is emulated with 7-13 insns.  */ | ||||
| 	  if (mode == V16QImode || mode == V32QImode) | ||||
| 	    { | ||||
| 	      int extra = TARGET_XOP ? 5 : TARGET_SSSE3 ? 6 : 11; | ||||
| 	      int extra = 11; | ||||
| 	      if (TARGET_XOP && mode == V16QImode) | ||||
| 		extra = 5; | ||||
| 	      else if (TARGET_SSSE3) | ||||
| 		extra = 6; | ||||
| 	      *total = cost->fmul * 2 + cost->fabs * extra; | ||||
| 	    } | ||||
| 	  /* Without sse4.1, we don't have PMULLD; it's emulated with 7
 | ||||
|  | @ -38519,6 +38523,34 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) | |||
|   expand_vec_perm_even_odd_1 (&d, odd); | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) | ||||
| { | ||||
|   struct expand_vec_perm_d d; | ||||
|   unsigned i, nelt, base; | ||||
|   bool ok; | ||||
| 
 | ||||
|   d.target = targ; | ||||
|   d.op0 = op0; | ||||
|   d.op1 = op1; | ||||
|   d.vmode = GET_MODE (targ); | ||||
|   d.nelt = nelt = GET_MODE_NUNITS (d.vmode); | ||||
|   d.one_operand_p = false; | ||||
|   d.testing_p = false; | ||||
| 
 | ||||
|   base = high_p ? nelt / 2 : 0; | ||||
|   for (i = 0; i < nelt / 2; ++i) | ||||
|     { | ||||
|       d.perm[i * 2] = i + base; | ||||
|       d.perm[i * 2 + 1] = i + base + nelt; | ||||
|     } | ||||
| 
 | ||||
|   /* Note that for AVX this isn't one instruction.  */ | ||||
|   ok = ix86_expand_vec_perm_const_1 (&d); | ||||
|   gcc_assert (ok); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Expand a vector operation CODE for a V*QImode in terms of the
 | ||||
|    same operation on V*HImode.  */ | ||||
| 
 | ||||
|  | @ -38627,59 +38659,148 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) | |||
| } | ||||
| 
 | ||||
| void | ||||
| ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) | ||||
| ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, | ||||
| 			       bool uns_p, bool odd_p) | ||||
| { | ||||
|   rtx op1_m1, op1_m2; | ||||
|   rtx op2_m1, op2_m2; | ||||
|   rtx res_1, res_2; | ||||
|   enum machine_mode mode = GET_MODE (op1); | ||||
|   rtx x; | ||||
| 
 | ||||
|   /* Shift both input vectors down one element, so that elements 3
 | ||||
|      and 1 are now in the slots for elements 2 and 0.  For K8, at | ||||
|      least, this is faster than using a shuffle.  */ | ||||
|   op1_m1 = op1 = force_reg (V4SImode, op1); | ||||
|   op1_m2 = gen_reg_rtx (V4SImode); | ||||
|   emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op1_m2), | ||||
| 				 gen_lowpart (V1TImode, op1), | ||||
| 				 GEN_INT (32))); | ||||
|   /* We only play even/odd games with vectors of SImode.  */ | ||||
|   gcc_assert (mode == V4SImode || mode == V8SImode); | ||||
| 
 | ||||
|   if (GET_CODE (op2) == CONST_VECTOR) | ||||
|   /* If we're looking for the odd results, shift those members down to
 | ||||
|      the even slots.  For some cpus this is faster than a PSHUFD.  */ | ||||
|   if (odd_p) | ||||
|     { | ||||
|       rtvec v; | ||||
|       enum machine_mode wmode = GET_MODE (dest); | ||||
| 
 | ||||
|       /* Constant propagate the vector shift, leaving the dont-care
 | ||||
| 	 vector elements as zero.  */ | ||||
|       v = rtvec_alloc (4); | ||||
|       RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 0); | ||||
|       RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 2); | ||||
|       RTVEC_ELT (v, 1) = const0_rtx; | ||||
|       RTVEC_ELT (v, 3) = const0_rtx; | ||||
|       op2_m1 = gen_rtx_CONST_VECTOR (V4SImode, v); | ||||
|       op2_m1 = force_reg (V4SImode, op2_m1); | ||||
|       op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), | ||||
| 			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL, | ||||
| 			  1, OPTAB_DIRECT); | ||||
|       op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), | ||||
| 			  GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL, | ||||
| 			  1, OPTAB_DIRECT); | ||||
|       op1 = gen_lowpart (mode, op1); | ||||
|       op2 = gen_lowpart (mode, op2); | ||||
|     } | ||||
| 
 | ||||
|       v = rtvec_alloc (4); | ||||
|       RTVEC_ELT (v, 0) = CONST_VECTOR_ELT (op2, 1); | ||||
|       RTVEC_ELT (v, 2) = CONST_VECTOR_ELT (op2, 3); | ||||
|       RTVEC_ELT (v, 1) = const0_rtx; | ||||
|       RTVEC_ELT (v, 3) = const0_rtx; | ||||
|       op2_m2 = gen_rtx_CONST_VECTOR (V4SImode, v); | ||||
|       op2_m2 = force_reg (V4SImode, op2_m2); | ||||
|   if (mode == V8SImode) | ||||
|     { | ||||
|       if (uns_p) | ||||
| 	x = gen_avx2_umulv4siv4di3 (dest, op1, op2); | ||||
|       else | ||||
| 	x = gen_avx2_mulv4siv4di3 (dest, op1, op2); | ||||
|     } | ||||
|   else if (uns_p) | ||||
|     x = gen_sse2_umulv2siv2di3 (dest, op1, op2); | ||||
|   else if (TARGET_SSE4_1) | ||||
|     x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); | ||||
|   else if (TARGET_XOP) | ||||
|     { | ||||
|       x = force_reg (V2DImode, CONST0_RTX (V2DImode)); | ||||
|       x = gen_xop_pmacsdql (dest, op1, op2, x); | ||||
|     } | ||||
|   else | ||||
|     { | ||||
|       op2_m1 = op2 = force_reg (V4SImode, op2); | ||||
|       op2_m2 = gen_reg_rtx (V4SImode); | ||||
|       emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, op2_m2), | ||||
| 				     gen_lowpart (V1TImode, op2), | ||||
| 				     GEN_INT (32))); | ||||
|     } | ||||
|     gcc_unreachable (); | ||||
|   emit_insn (x); | ||||
| } | ||||
| 
 | ||||
| void | ||||
| ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, | ||||
| 			    bool uns_p, bool high_p) | ||||
| { | ||||
|   enum machine_mode wmode = GET_MODE (dest); | ||||
|   enum machine_mode mode = GET_MODE (op1); | ||||
|   rtx t1, t2, t3, t4, mask; | ||||
| 
 | ||||
|   switch (mode) | ||||
|     { | ||||
|     case V4SImode: | ||||
|       t1 = gen_reg_rtx (mode); | ||||
|       t2 = gen_reg_rtx (mode); | ||||
|       if (TARGET_XOP && !uns_p) | ||||
| 	{ | ||||
| 	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
 | ||||
| 	     shuffle the elements once so that all elements are in the right | ||||
| 	     place for immediate use: { A C B D }.  */ | ||||
| 	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, | ||||
| 					const1_rtx, GEN_INT (3))); | ||||
| 	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, | ||||
| 					const1_rtx, GEN_INT (3))); | ||||
| 	} | ||||
|       else | ||||
| 	{ | ||||
| 	  /* Put the elements into place for the multiply.  */ | ||||
| 	  ix86_expand_vec_interleave (t1, op1, op1, high_p); | ||||
| 	  ix86_expand_vec_interleave (t2, op2, op2, high_p); | ||||
| 	  high_p = false; | ||||
| 	} | ||||
|       ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); | ||||
|       break; | ||||
| 
 | ||||
|     case V8SImode: | ||||
|       /* Shuffle the elements between the lanes.  After this we
 | ||||
| 	 have { A B E F | C D G H } for each operand.  */ | ||||
|       t1 = gen_reg_rtx (V4DImode); | ||||
|       t2 = gen_reg_rtx (V4DImode); | ||||
|       emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), | ||||
| 				      const0_rtx, const2_rtx, | ||||
| 				      const1_rtx, GEN_INT (3))); | ||||
|       emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), | ||||
| 				      const0_rtx, const2_rtx, | ||||
| 				      const1_rtx, GEN_INT (3))); | ||||
| 
 | ||||
|       /* Shuffle the elements within the lanes.  After this we
 | ||||
| 	 have { A A B B | C C D D } or { E E F F | G G H H }.  */ | ||||
|       t3 = gen_reg_rtx (V8SImode); | ||||
|       t4 = gen_reg_rtx (V8SImode); | ||||
|       mask = GEN_INT (high_p | ||||
| 		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6) | ||||
| 		      : 0 + (0 << 2) + (1 << 4) + (1 << 6)); | ||||
|       emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); | ||||
|       emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); | ||||
| 
 | ||||
|       ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); | ||||
|       break; | ||||
| 
 | ||||
|     case V8HImode: | ||||
|     case V16HImode: | ||||
|       t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, | ||||
| 			 uns_p, OPTAB_DIRECT); | ||||
|       t2 = expand_binop (mode, | ||||
| 			 uns_p ? umul_highpart_optab : smul_highpart_optab, | ||||
| 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); | ||||
|       gcc_assert (t1 && t2); | ||||
| 
 | ||||
|       ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p); | ||||
|       break; | ||||
| 
 | ||||
|     case V16QImode: | ||||
|     case V32QImode: | ||||
|       t1 = gen_reg_rtx (wmode); | ||||
|       t2 = gen_reg_rtx (wmode); | ||||
|       ix86_expand_sse_unpack (t1, op1, uns_p, high_p); | ||||
|       ix86_expand_sse_unpack (t2, op2, uns_p, high_p); | ||||
| 
 | ||||
|       emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2))); | ||||
|       break; | ||||
| 
 | ||||
|     default: | ||||
|       gcc_unreachable (); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| void | ||||
| ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) | ||||
| { | ||||
|   rtx res_1, res_2; | ||||
| 
 | ||||
|   /* Widening multiply of elements 0+2, and 1+3.  */ | ||||
|   res_1 = gen_reg_rtx (V4SImode); | ||||
|   res_2 = gen_reg_rtx (V4SImode); | ||||
|   emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_1), | ||||
| 				     op1_m1, op2_m1)); | ||||
|   emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, res_2), | ||||
| 				     op1_m2, op2_m2)); | ||||
|   ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1), | ||||
| 				 op1, op2, true, false); | ||||
|   ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2), | ||||
| 				 op1, op2, true, true); | ||||
| 
 | ||||
|   /* Move the results in element 2 down to element 1; we don't care
 | ||||
|      what goes in elements 2 and 3.  Then we can merge the parts | ||||
|  |  | |||
|  | @ -744,6 +744,7 @@ | |||
| ;; Prefix for define_insn | ||||
| (define_code_attr u [(sign_extend "") (zero_extend "u")]) | ||||
| (define_code_attr s [(sign_extend "s") (zero_extend "u")]) | ||||
| (define_code_attr u_bool [(sign_extend "false") (zero_extend "true")]) | ||||
| 
 | ||||
| ;; All integer modes. | ||||
| (define_mode_iterator SWI1248x [QI HI SI DI]) | ||||
|  |  | |||
|  | @ -816,13 +816,6 @@ | |||
|   return false; | ||||
| }) | ||||
| 
 | ||||
| ;; Return true when OP is a nonimmediate or a vector constant.  Note | ||||
| ;; that most vector constants are not legitimate operands, so we need | ||||
| ;; to special-case this. | ||||
| (define_predicate "nonimmediate_or_const_vector_operand" | ||||
|   (ior (match_code "const_vector") | ||||
|        (match_operand 0 "nonimmediate_operand"))) | ||||
| 
 | ||||
| ;; Return true if OP is a register or a zero. | ||||
| (define_predicate "reg_or_0_operand" | ||||
|   (ior (match_operand 0 "register_operand") | ||||
|  |  | |||
|  | @ -5555,10 +5555,10 @@ | |||
|   [(set (match_operand:VI4_AVX2 0 "register_operand") | ||||
| 	(mult:VI4_AVX2 | ||||
| 	  (match_operand:VI4_AVX2 1 "nonimmediate_operand") | ||||
| 	  (match_operand:VI4_AVX2 2 "nonimmediate_or_const_vector_operand")))] | ||||
| 	  (match_operand:VI4_AVX2 2 "nonimmediate_operand")))] | ||||
|   "TARGET_SSE2" | ||||
| { | ||||
|   if (TARGET_SSE4_1 || TARGET_AVX) | ||||
|   if (TARGET_SSE4_1) | ||||
|     { | ||||
|       if (CONSTANT_P (operands[2])) | ||||
| 	operands[2] = force_const_mem (<MODE>mode, operands[2]); | ||||
|  | @ -5677,198 +5677,28 @@ | |||
| (define_expand "vec_widen_<s>mult_hi_<mode>" | ||||
|   [(match_operand:<sseunpackmode> 0 "register_operand") | ||||
|    (any_extend:<sseunpackmode> | ||||
|      (match_operand:VI2_AVX2 1 "register_operand")) | ||||
|    (match_operand:VI2_AVX2 2 "register_operand")] | ||||
|   "TARGET_SSE2" | ||||
|      (match_operand:VI124_AVX2 1 "register_operand")) | ||||
|    (match_operand:VI124_AVX2 2 "register_operand")] | ||||
|   ; Note that SSE2 does not have signed SI multiply | ||||
|   "TARGET_XOP || TARGET_SSE4_1 | ||||
|    || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" | ||||
| { | ||||
|   rtx op1, op2, t1, t2, dest; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (<MODE>mode); | ||||
|   t2 = gen_reg_rtx (<MODE>mode); | ||||
|   dest = gen_lowpart (<MODE>mode, operands[0]); | ||||
| 
 | ||||
|   emit_insn (gen_mul<mode>3 (t1, op1, op2)); | ||||
|   emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2)); | ||||
|   emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2)); | ||||
|   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2], | ||||
| 			      <u_bool>, true); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_<s>mult_lo_<mode>" | ||||
|   [(match_operand:<sseunpackmode> 0 "register_operand") | ||||
|    (any_extend:<sseunpackmode> | ||||
|      (match_operand:VI2_AVX2 1 "register_operand")) | ||||
|    (match_operand:VI2_AVX2 2 "register_operand")] | ||||
|   "TARGET_SSE2" | ||||
|      (match_operand:VI124_AVX2 1 "register_operand")) | ||||
|    (match_operand:VI124_AVX2 2 "register_operand")] | ||||
|   ; Note that SSE2 does not have signed SI multiply | ||||
|   "TARGET_XOP || TARGET_SSE4_1 | ||||
|    || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))" | ||||
| { | ||||
|   rtx op1, op2, t1, t2, dest; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (<MODE>mode); | ||||
|   t2 = gen_reg_rtx (<MODE>mode); | ||||
|   dest = gen_lowpart (<MODE>mode, operands[0]); | ||||
| 
 | ||||
|   emit_insn (gen_mul<mode>3 (t1, op1, op2)); | ||||
|   emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2)); | ||||
|   emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_<s>mult_hi_v8si" | ||||
|   [(match_operand:V4DI 0 "register_operand") | ||||
|    (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand")) | ||||
|    (match_operand:V8SI 2 "nonimmediate_operand")] | ||||
|   "TARGET_AVX2" | ||||
| { | ||||
|   rtx t1, t2, t3, t4; | ||||
| 
 | ||||
|   t1 = gen_reg_rtx (V4DImode); | ||||
|   t2 = gen_reg_rtx (V4DImode); | ||||
|   t3 = gen_reg_rtx (V8SImode); | ||||
|   t4 = gen_reg_rtx (V8SImode); | ||||
|   emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]), | ||||
| 				  const0_rtx, const2_rtx, | ||||
| 				  const1_rtx, GEN_INT (3))); | ||||
|   emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]), | ||||
| 				  const0_rtx, const2_rtx, | ||||
| 				  const1_rtx, GEN_INT (3))); | ||||
|   emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), | ||||
| 				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6)))); | ||||
|   emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), | ||||
| 				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6)))); | ||||
|   emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_<s>mult_lo_v8si" | ||||
|   [(match_operand:V4DI 0 "register_operand") | ||||
|    (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand")) | ||||
|    (match_operand:V8SI 2 "nonimmediate_operand")] | ||||
|   "TARGET_AVX2" | ||||
| { | ||||
|   rtx t1, t2, t3, t4; | ||||
| 
 | ||||
|   t1 = gen_reg_rtx (V4DImode); | ||||
|   t2 = gen_reg_rtx (V4DImode); | ||||
|   t3 = gen_reg_rtx (V8SImode); | ||||
|   t4 = gen_reg_rtx (V8SImode); | ||||
|   emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]), | ||||
| 				  const0_rtx, const2_rtx, | ||||
| 				  const1_rtx, GEN_INT (3))); | ||||
|   emit_insn (gen_avx2_permv4di_1 (t2,  gen_lowpart (V4DImode, operands[2]), | ||||
| 				  const0_rtx, const2_rtx, | ||||
| 				  const1_rtx, GEN_INT (3))); | ||||
|   emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), | ||||
| 				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6)))); | ||||
|   emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), | ||||
| 				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6)))); | ||||
|   emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_smult_hi_v4si" | ||||
|   [(match_operand:V2DI 0 "register_operand") | ||||
|    (match_operand:V4SI 1 "register_operand") | ||||
|    (match_operand:V4SI 2 "register_operand")] | ||||
|   "TARGET_SSE4_1" | ||||
| { | ||||
|   rtx op1, op2, t1, t2; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (V4SImode); | ||||
|   t2 = gen_reg_rtx (V4SImode); | ||||
| 
 | ||||
|   if (TARGET_XOP) | ||||
|     { | ||||
|       rtx t3 = gen_reg_rtx (V2DImode); | ||||
| 
 | ||||
|       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), | ||||
| 				    GEN_INT (1), GEN_INT (3))); | ||||
|       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), | ||||
| 				    GEN_INT (1), GEN_INT (3))); | ||||
|       emit_move_insn (t3, CONST0_RTX (V2DImode)); | ||||
| 
 | ||||
|       emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3)); | ||||
|       DONE; | ||||
|     } | ||||
| 
 | ||||
|   emit_insn (gen_vec_interleave_highv4si (t1, op1, op1)); | ||||
|   emit_insn (gen_vec_interleave_highv4si (t2, op2, op2)); | ||||
|   emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_smult_lo_v4si" | ||||
|   [(match_operand:V2DI 0 "register_operand") | ||||
|    (match_operand:V4SI 1 "register_operand") | ||||
|    (match_operand:V4SI 2 "register_operand")] | ||||
|   "TARGET_SSE4_1" | ||||
| { | ||||
|   rtx op1, op2, t1, t2; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (V4SImode); | ||||
|   t2 = gen_reg_rtx (V4SImode); | ||||
| 
 | ||||
|   if (TARGET_XOP) | ||||
|     { | ||||
|       rtx t3 = gen_reg_rtx (V2DImode); | ||||
| 
 | ||||
|       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), | ||||
| 				    GEN_INT (1), GEN_INT (3))); | ||||
|       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), | ||||
| 				    GEN_INT (1), GEN_INT (3))); | ||||
|       emit_move_insn (t3, CONST0_RTX (V2DImode)); | ||||
| 
 | ||||
|       emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3)); | ||||
|       DONE; | ||||
|     } | ||||
| 
 | ||||
|   emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1)); | ||||
|   emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2)); | ||||
|   emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_umult_hi_v4si" | ||||
|   [(match_operand:V2DI 0 "register_operand") | ||||
|    (match_operand:V4SI 1 "register_operand") | ||||
|    (match_operand:V4SI 2 "register_operand")] | ||||
|   "TARGET_SSE2" | ||||
| { | ||||
|   rtx op1, op2, t1, t2; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (V4SImode); | ||||
|   t2 = gen_reg_rtx (V4SImode); | ||||
| 
 | ||||
|   emit_insn (gen_vec_interleave_highv4si (t1, op1, op1)); | ||||
|   emit_insn (gen_vec_interleave_highv4si (t2, op2, op2)); | ||||
|   emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2)); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
| (define_expand "vec_widen_umult_lo_v4si" | ||||
|   [(match_operand:V2DI 0 "register_operand") | ||||
|    (match_operand:V4SI 1 "register_operand") | ||||
|    (match_operand:V4SI 2 "register_operand")] | ||||
|   "TARGET_SSE2" | ||||
| { | ||||
|   rtx op1, op2, t1, t2; | ||||
| 
 | ||||
|   op1 = operands[1]; | ||||
|   op2 = operands[2]; | ||||
|   t1 = gen_reg_rtx (V4SImode); | ||||
|   t2 = gen_reg_rtx (V4SImode); | ||||
| 
 | ||||
|   emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1)); | ||||
|   emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2)); | ||||
|   emit_insn (gen_sse2_umulv2siv2di3 (operands[0], t1, t2)); | ||||
|   ix86_expand_mul_widen_hilo (operands[0], operands[1], operands[2], | ||||
| 			      <u_bool>, false); | ||||
|   DONE; | ||||
| }) | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Richard Henderson
						Richard Henderson