[AArch64][5/10] ARMv8.2-A FP16 lane vector intrinsics

gcc/
	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

From-SVN: r238719
Author: Jiong Wang <jiong.wang@arm.com>  (committed by Jiong Wang)
Date:   2016-07-25 14:49:57 +00:00
Commit: ab2e8f01f1, parent 89ed6d5f5e
4 changed files with 190 additions and 16 deletions
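As a quick illustration of the intrinsics this patch adds, here is a minimal usage sketch (not part of the commit; the function name and values are invented for the example). It assumes a compiler and target with ARMv8.2-A FP16 support, e.g. -march=armv8.2-a+fp16:

#include <arm_neon.h>

/* acc[i] += x[i] * coeffs[2] for every lane, then scale the result.  */
float16x4_t
fma_by_lane (float16x4_t acc, float16x4_t x, float16x4_t coeffs)
{
  /* vfma_lane_f16 multiplies each element of x by lane 2 of coeffs and
     accumulates into acc; the lane index must be a compile-time constant.  */
  acc = vfma_lane_f16 (acc, x, coeffs, 2);
  /* vmul_n_f16 multiplies every lane by a scalar.  */
  return vmul_n_f16 (acc, (float16_t) 0.5);
}

With the patterns touched by this patch, the dup-plus-fma sequences the intrinsics use internally are expected to fold into single by-element fmla/fmul instructions.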

gcc/ChangeLog

@@ -1,3 +1,20 @@
2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.

gcc/config/aarch64/aarch64-simd.md

@@ -351,7 +351,7 @@
     operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
@@ -379,7 +379,7 @@
       (match_operand:VMUL 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 (define_insn "aarch64_rsqrte<mode>"
@@ -1634,7 +1634,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1712,7 +1712,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_fnma4_elt_to_64v2df"
@@ -3101,20 +3101,18 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )

-;; vmulxq_lane_f64
+;; vmulxq_lane

-(define_insn "*aarch64_mulx_elt_to_64v2df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (unspec:V2DF
-     [(match_operand:V2DF 1 "register_operand" "w")
-      (vec_duplicate:V2DF
-	(match_operand:DF 2 "register_operand" "w"))]
+(define_insn "*aarch64_mulx_elt_from_dup<mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+    (unspec:VHSDF
+     [(match_operand:VHSDF 1 "register_operand" "w")
+      (vec_duplicate:VHSDF
+	(match_operand:<VEL> 2 "register_operand" "w"))]
      UNSPEC_FMULX))]
   "TARGET_SIMD"
-  {
-    return "fmulx\t%0.2d, %1.2d, %2.d[0]";
-  }
-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+  "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 ;; vmulxs_lane_f32, vmulxs_laneq_f32
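The effect of the generalisation above is that an FMULX whose second operand is a duplicated scalar can now be matched for the HF (and SF) vector modes as well as V2DF. As a hedged sketch of what this enables (not taken from the patch; the function name is invented), a call such as the following would be expected to combine into a single by-element fmulx rather than an explicit dup followed by a vector fmulx:

#include <arm_neon.h>

float16x4_t
scale_mulx (float16x4_t v, float16_t s)
{
  /* vmulx_n_f16 is implemented as vmulx_f16 (v, vdup_n_f16 (s)); the
     renamed *aarch64_mulx_elt_from_dup<mode> pattern should let combine
     fold the dup into the multiply, e.g. "fmulx v0.4h, v0.4h, v1.h[0]".  */
  return vmulx_n_f16 (v, s);
}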

gcc/config/aarch64/arm_neon.h

@@ -26773,6 +26773,160 @@ vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
  return __builtin_aarch64_fnmav8hf (__b, __c, __a);
}

/* ARMv8.2-A FP16 lane vector intrinsics. */
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfma_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfms_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_n_f16 (float16x4_t __a, float16_t __b)
{
return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_n_f16 (float16x4_t __a, float16_t __b)
{
return vmulx_f16 (__a, vdup_n_f16 (__b));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulxq_f16 (__a, vdupq_n_f16 (__b));
}
#pragma GCC pop_options
#undef __aarch64_vget_lane_any
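For reference, the lane forms reduce to simple scalar arithmetic per element. The following stand-alone check is a sketch added for illustration (not part of the patch; names and values are invented); it compares vmul_lane_f16 and vfms_lane_f16 against scalar computation and assumes an ARMv8.2-A FP16 toolchain and target:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  float16_t a_buf[4] = {1.0, 2.0, 3.0, 4.0};
  float16_t b_buf[4] = {0.5, 1.5, 2.5, 3.5};
  float16x4_t a = vld1_f16 (a_buf);
  float16x4_t b = vld1_f16 (b_buf);

  /* mul[i] = a[i] * b[3];  fms[i] = a[i] - a[i] * b[3].  */
  float16x4_t mul = vmul_lane_f16 (a, b, 3);
  float16x4_t fms = vfms_lane_f16 (a, a, b, 3);

  float16_t mul_out[4], fms_out[4];
  vst1_f16 (mul_out, mul);
  vst1_f16 (fms_out, fms);

  for (int i = 0; i < 4; i++)
    printf ("%g %g (expect %g %g)\n",
	    (double) mul_out[i], (double) fms_out[i],
	    a_buf[i] * 3.5, a_buf[i] - a_buf[i] * 3.5);
  return 0;
}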

gcc/config/aarch64/iterators.md

@@ -218,7 +218,10 @@
 (define_mode_iterator DX [DI DF])

 ;; Modes available for <f>mul lane operations.
-(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
+(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI
+			    (V4HF "TARGET_SIMD_F16INST")
+			    (V8HF "TARGET_SIMD_F16INST")
+			    V2SF V4SF V2DF])

 ;; Modes available for <f>mul lane operations changing lane count.
 (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -730,6 +733,7 @@
 		      (V4HI "") (V8HI "")
 		      (V2SI "") (V4SI "")
 		      (DI "") (V2DI "")
+		      (V4HF "f") (V8HF "f")
 		      (V2SF "f") (V4SF "f")
 		      (V2DF "f") (DF "f")])
@@ -738,6 +742,7 @@
 		       (V4HI "") (V8HI "")
 		       (V2SI "") (V4SI "")
 		       (DI "") (V2DI "")
+		       (V4HF "_fp") (V8HF "_fp")
 		       (V2SF "_fp") (V4SF "_fp")
 		       (V2DF "_fp") (DF "_fp")
 		       (SF "_fp")])