[AArch64][5/10] ARMv8.2-A FP16 lane vector intrinsics

gcc/
	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

From-SVN: r238719
Author: Jiong Wang <jiong.wang@arm.com>  (committed by Jiong Wang)
Date:   2016-07-25 14:49:57 +00:00
Commit: ab2e8f01f1, parent 89ed6d5f5e
4 changed files with 190 additions and 16 deletions
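As a quick illustration of the intrinsics this patch adds, here is a minimal usage sketch (not part of the commit; the function name and values are invented for the example). It assumes a compiler and target with ARMv8.2-A FP16 support, e.g. -march=armv8.2-a+fp16:

#include <arm_neon.h>

/* acc[i] += x[i] * coeffs[2] for every lane, then scale the result.  */
float16x4_t
fma_by_lane (float16x4_t acc, float16x4_t x, float16x4_t coeffs)
{
  /* vfma_lane_f16 multiplies each element of x by lane 2 of coeffs and
     accumulates into acc; the lane index must be a compile-time constant.  */
  acc = vfma_lane_f16 (acc, x, coeffs, 2);
  /* vmul_n_f16 multiplies every lane by a scalar.  */
  return vmul_n_f16 (acc, (float16_t) 0.5);
}

With the patterns touched by this patch, the dup-plus-fma sequences the intrinsics use internally are expected to fold into single by-element fmla/fmul instructions.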

gcc/ChangeLog

@@ -1,3 +1,20 @@
2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
	"*aarch64_mulx_elt_from_dup<mode>".
	(*aarch64_mul3_elt<mode>): Update schedule type.
	(*aarch64_mul3_elt_from_dup<mode>): Likewise.
	(*aarch64_fma4_elt_from_dup<mode>): Likewise.
	(*aarch64_fnma4_elt_from_dup<mode>): Likewise.
	* config/aarch64/iterators.md (VMUL): Support half precision float modes.
	(f, fp): Support HF modes.
	* config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
	vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
	vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
	vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
	vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
	vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16): New.

2016-07-25  Jiong Wang  <jiong.wang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.

gcc/config/aarch64/aarch64-simd.md

@@ -351,7 +351,7 @@
     operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
     return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
   }
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
@@ -379,7 +379,7 @@
       (match_operand:VMUL 2 "register_operand" "w")))]
   "TARGET_SIMD"
   "<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
-  [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 (define_insn "aarch64_rsqrte<mode>"
@@ -1634,7 +1634,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_fma4_elt_to_64v2df"
@@ -1712,7 +1712,7 @@
       (match_operand:VMUL 3 "register_operand" "0")))]
   "TARGET_SIMD"
   "fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+  [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
 )

 (define_insn "*aarch64_fnma4_elt_to_64v2df"
@@ -3101,20 +3101,18 @@
   [(set_attr "type" "neon_fp_mul_<Vetype><q>")]
 )

-;; vmulxq_lane_f64
+;; vmulxq_lane

-(define_insn "*aarch64_mulx_elt_to_64v2df"
-  [(set (match_operand:V2DF 0 "register_operand" "=w")
-    (unspec:V2DF
-     [(match_operand:V2DF 1 "register_operand" "w")
-      (vec_duplicate:V2DF
-	(match_operand:DF 2 "register_operand" "w"))]
+(define_insn "*aarch64_mulx_elt_from_dup<mode>"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+    (unspec:VHSDF
+     [(match_operand:VHSDF 1 "register_operand" "w")
+      (vec_duplicate:VHSDF
+	(match_operand:<VEL> 2 "register_operand" "w"))]
      UNSPEC_FMULX))]
   "TARGET_SIMD"
-  {
-    return "fmulx\t%0.2d, %1.2d, %2.d[0]";
-  }
-  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+  "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
+  [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
 )

 ;; vmulxs_lane_f32, vmulxs_laneq_f32
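The effect of the generalisation above is that an FMULX whose second operand is a duplicated scalar can now be matched for the HF (and SF) vector modes as well as V2DF. As a hedged sketch of what this enables (not taken from the patch; the function name is invented), a call such as the following would be expected to combine into a single by-element fmulx rather than an explicit dup followed by a vector fmulx:

#include <arm_neon.h>

float16x4_t
scale_mulx (float16x4_t v, float16_t s)
{
  /* vmulx_n_f16 is implemented as vmulx_f16 (v, vdup_n_f16 (s)); the
     renamed *aarch64_mulx_elt_from_dup<mode> pattern should let combine
     fold the dup into the multiply, e.g. "fmulx v0.4h, v0.4h, v1.h[0]".  */
  return vmulx_n_f16 (v, s);
}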

gcc/config/aarch64/arm_neon.h

@@ -26773,6 +26773,160 @@ vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
  return __builtin_aarch64_fnmav8hf (__b, __c, __a);
}

/* ARMv8.2-A FP16 lane vector intrinsics. */
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfma_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
float16x4_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
float16x4_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
float16x8_t __c, const int __lane)
{
return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
float16x8_t __c, const int __lane)
{
return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
{
return vfms_f16 (__a, __b, vdup_n_f16 (__c));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
{
return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmul_n_f16 (float16x4_t __a, float16_t __b)
{
return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
{
return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
{
return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane));
}
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vmulx_n_f16 (float16x4_t __a, float16_t __b)
{
return vmulx_f16 (__a, vdup_n_f16 (__b));
}
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vmulxq_n_f16 (float16x8_t __a, float16_t __b)
{
return vmulxq_f16 (__a, vdupq_n_f16 (__b));
}
#pragma GCC pop_options
#undef __aarch64_vget_lane_any
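For reference, the lane forms reduce to simple scalar arithmetic per element. The following stand-alone check is a sketch added for illustration (not part of the patch; names and values are invented); it compares vmul_lane_f16 and vfms_lane_f16 against scalar computation and assumes an ARMv8.2-A FP16 toolchain and target:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  float16_t a_buf[4] = {1.0, 2.0, 3.0, 4.0};
  float16_t b_buf[4] = {0.5, 1.5, 2.5, 3.5};
  float16x4_t a = vld1_f16 (a_buf);
  float16x4_t b = vld1_f16 (b_buf);

  /* mul[i] = a[i] * b[3];  fms[i] = a[i] - a[i] * b[3].  */
  float16x4_t mul = vmul_lane_f16 (a, b, 3);
  float16x4_t fms = vfms_lane_f16 (a, a, b, 3);

  float16_t mul_out[4], fms_out[4];
  vst1_f16 (mul_out, mul);
  vst1_f16 (fms_out, fms);

  for (int i = 0; i < 4; i++)
    printf ("%g %g (expect %g %g)\n",
	    (double) mul_out[i], (double) fms_out[i],
	    a_buf[i] * 3.5, a_buf[i] - a_buf[i] * 3.5);
  return 0;
}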

gcc/config/aarch64/iterators.md

@@ -218,7 +218,10 @@
 (define_mode_iterator DX [DI DF])

 ;; Modes available for <f>mul lane operations.
-(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI V2SF V4SF V2DF])
+(define_mode_iterator VMUL [V4HI V8HI V2SI V4SI
+			    (V4HF "TARGET_SIMD_F16INST")
+			    (V8HF "TARGET_SIMD_F16INST")
+			    V2SF V4SF V2DF])

 ;; Modes available for <f>mul lane operations changing lane count.
 (define_mode_iterator VMUL_CHANGE_NLANES [V4HI V8HI V2SI V4SI V2SF V4SF])
@@ -730,6 +733,7 @@
 		      (V4HI "") (V8HI "")
 		      (V2SI "") (V4SI "")
 		      (DI "") (V2DI "")
+		      (V4HF "f") (V8HF "f")
 		      (V2SF "f") (V4SF "f")
 		      (V2DF "f") (DF "f")])
@@ -738,6 +742,7 @@
 		       (V4HI "") (V8HI "")
 		       (V2SI "") (V4SI "")
 		       (DI "") (V2DI "")
+		       (V4HF "_fp") (V8HF "_fp")
 		       (V2SF "_fp") (V4SF "_fp")
 		       (V2DF "_fp") (DF "_fp")
 		       (SF "_fp")])