re PR libfortran/78379 (Processor-specific versions for matmul)

2017-05-25  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR libfortran/78379
	* Makefile.am: Add generated/matmulavx128_*.c files.
	Handle them for compiling and setting the right flags.
	* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
	* configure.ac: Call them.
	* Makefile.in: Regenerated.
	* config.h.in: Regenerated.
	* configure: Regenerated.
	* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
	versions which use FMA3 or FMA4.
	* m4/matmulavx128.m4: New file.
        * generated/matmul_c10.c: Regenerated.
        * generated/matmul_c16.c: Regenerated.
        * generated/matmul_c4.c: Regenerated.
        * generated/matmul_c8.c: Regenerated.
        * generated/matmul_i1.c: Regenerated.
        * generated/matmul_i16.c: Regenerated.
        * generated/matmul_i2.c: Regenerated.
        * generated/matmul_i4.c: Regenerated.
        * generated/matmul_i8.c: Regenerated.
        * generated/matmul_r10.c: Regenerated.
        * generated/matmul_r16.c: Regenerated.
        * generated/matmul_r4.c: Regenerated.
        * generated/matmul_r8.c: Regenerated.
        * generated/matmulavx128_c10.c: New file.
        * generated/matmulavx128_c16.c: New file.
        * generated/matmulavx128_c4.c: New file.
        * generated/matmulavx128_c8.c: New file.
        * generated/matmulavx128_i1.c: New file.
        * generated/matmulavx128_i16.c: New file.
        * generated/matmulavx128_i2.c: New file.
        * generated/matmulavx128_i4.c: New file.
        * generated/matmulavx128_i8.c: New file.
        * generated/matmulavx128_r10.c: New file.
        * generated/matmulavx128_r16.c: New file.
        * generated/matmulavx128_r4.c: New file.
        * generated/matmulavx128_r8.c: New file.

From-SVN: r248472
This commit is contained in:
Thomas Koenig 2017-05-25 21:51:27 +00:00
parent 87e1e6036e
commit 1d5cf7fcf2
35 changed files with 15964 additions and 27 deletions

View File

@ -1,3 +1,43 @@
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
Jerry DeLisle <jvdelisle@gcc.gnu.org>
@ -14,7 +54,7 @@
(st_endfile): Likewise.
(st_rewind): Likewise.
(st_flush): Likewise.
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727

View File

@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c= \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c= \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
intrinsics/f2c_specifics.F90
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
if HAVE_AVX128
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
endif
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@

View File

@ -289,15 +289,20 @@ am__objects_32 = unpack_i1.lo unpack_i2.lo unpack_i4.lo unpack_i8.lo \
unpack_i16.lo unpack_r4.lo unpack_r8.lo unpack_r10.lo \
unpack_r16.lo unpack_c4.lo unpack_c8.lo unpack_c10.lo \
unpack_c16.lo
am__objects_33 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
am__objects_33 = matmulavx128_i1.lo matmulavx128_i2.lo \
matmulavx128_i4.lo matmulavx128_i8.lo matmulavx128_i16.lo \
matmulavx128_r4.lo matmulavx128_r8.lo matmulavx128_r10.lo \
matmulavx128_r16.lo matmulavx128_c4.lo matmulavx128_c8.lo \
matmulavx128_c10.lo matmulavx128_c16.lo
am__objects_34 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
spread_i16.lo spread_r4.lo spread_r8.lo spread_r10.lo \
spread_r16.lo spread_c4.lo spread_c8.lo spread_c10.lo \
spread_c16.lo
am__objects_34 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
am__objects_35 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
cshift0_i8.lo cshift0_i16.lo cshift0_r4.lo cshift0_r8.lo \
cshift0_r10.lo cshift0_r16.lo cshift0_c4.lo cshift0_c8.lo \
cshift0_c10.lo cshift0_c16.lo
am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
am__objects_36 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_7) $(am__objects_8) $(am__objects_9) \
$(am__objects_10) $(am__objects_11) $(am__objects_12) \
$(am__objects_13) $(am__objects_14) $(am__objects_15) \
@ -307,14 +312,14 @@ am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_25) $(am__objects_26) $(am__objects_27) \
$(am__objects_28) $(am__objects_29) $(am__objects_30) \
$(am__objects_31) $(am__objects_32) $(am__objects_33) \
$(am__objects_34)
@LIBGFOR_MINIMAL_FALSE@am__objects_36 = close.lo file_pos.lo format.lo \
$(am__objects_34) $(am__objects_35)
@LIBGFOR_MINIMAL_FALSE@am__objects_37 = close.lo file_pos.lo format.lo \
@LIBGFOR_MINIMAL_FALSE@ inquire.lo intrinsics.lo list_read.lo \
@LIBGFOR_MINIMAL_FALSE@ lock.lo open.lo read.lo transfer.lo \
@LIBGFOR_MINIMAL_FALSE@ transfer128.lo unit.lo unix.lo write.lo \
@LIBGFOR_MINIMAL_FALSE@ fbuf.lo
am__objects_37 = size_from_kind.lo $(am__objects_36)
@LIBGFOR_MINIMAL_FALSE@am__objects_38 = access.lo c99_functions.lo \
am__objects_38 = size_from_kind.lo $(am__objects_37)
@LIBGFOR_MINIMAL_FALSE@am__objects_39 = access.lo c99_functions.lo \
@LIBGFOR_MINIMAL_FALSE@ chdir.lo chmod.lo clock.lo cpu_time.lo \
@LIBGFOR_MINIMAL_FALSE@ ctime.lo date_and_time.lo dtime.lo \
@LIBGFOR_MINIMAL_FALSE@ env.lo etime.lo execute_command_line.lo \
@ -324,19 +329,19 @@ am__objects_37 = size_from_kind.lo $(am__objects_36)
@LIBGFOR_MINIMAL_FALSE@ rename.lo stat.lo symlnk.lo \
@LIBGFOR_MINIMAL_FALSE@ system_clock.lo time.lo umask.lo \
@LIBGFOR_MINIMAL_FALSE@ unlink.lo
@IEEE_SUPPORT_TRUE@am__objects_39 = ieee_helper.lo
am__objects_40 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
@IEEE_SUPPORT_TRUE@am__objects_40 = ieee_helper.lo
am__objects_41 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
eoshift2.lo erfc_scaled.lo extends_type_of.lo fnum.lo \
ierrno.lo ishftc.lo mvbits.lo move_alloc.lo pack_generic.lo \
selected_char_kind.lo size.lo spread_generic.lo \
string_intrinsics.lo rand.lo random.lo reshape_generic.lo \
reshape_packed.lo selected_int_kind.lo selected_real_kind.lo \
unpack_generic.lo in_pack_generic.lo in_unpack_generic.lo \
$(am__objects_38) $(am__objects_39)
@IEEE_SUPPORT_TRUE@am__objects_41 = ieee_arithmetic.lo \
$(am__objects_39) $(am__objects_40)
@IEEE_SUPPORT_TRUE@am__objects_42 = ieee_arithmetic.lo \
@IEEE_SUPPORT_TRUE@ ieee_exceptions.lo ieee_features.lo
am__objects_42 =
am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
am__objects_43 =
am__objects_44 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_abs_i4.lo _abs_i8.lo _abs_i16.lo _abs_r4.lo _abs_r8.lo \
_abs_r10.lo _abs_r16.lo _aimag_c4.lo _aimag_c8.lo \
_aimag_c10.lo _aimag_c16.lo _exp_r4.lo _exp_r8.lo _exp_r10.lo \
@ -360,19 +365,19 @@ am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_conjg_c4.lo _conjg_c8.lo _conjg_c10.lo _conjg_c16.lo \
_aint_r4.lo _aint_r8.lo _aint_r10.lo _aint_r16.lo _anint_r4.lo \
_anint_r8.lo _anint_r10.lo _anint_r16.lo
am__objects_44 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
am__objects_45 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
_sign_r8.lo _sign_r10.lo _sign_r16.lo _dim_i4.lo _dim_i8.lo \
_dim_i16.lo _dim_r4.lo _dim_r8.lo _dim_r10.lo _dim_r16.lo \
_atan2_r4.lo _atan2_r8.lo _atan2_r10.lo _atan2_r16.lo \
_mod_i4.lo _mod_i8.lo _mod_i16.lo _mod_r4.lo _mod_r8.lo \
_mod_r10.lo _mod_r16.lo
am__objects_45 = misc_specifics.lo
am__objects_46 = $(am__objects_43) $(am__objects_44) $(am__objects_45) \
am__objects_46 = misc_specifics.lo
am__objects_47 = $(am__objects_44) $(am__objects_45) $(am__objects_46) \
dprod_r8.lo f2c_specifics.lo
am__objects_47 = $(am__objects_3) $(am__objects_35) $(am__objects_37) \
$(am__objects_40) $(am__objects_41) $(am__objects_42) \
$(am__objects_46)
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_47)
am__objects_48 = $(am__objects_3) $(am__objects_36) $(am__objects_38) \
$(am__objects_41) $(am__objects_42) $(am__objects_43) \
$(am__objects_47)
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_48)
@onestep_TRUE@am_libgfortran_la_OBJECTS = libgfortran_c.lo
libgfortran_la_OBJECTS = $(am_libgfortran_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
@ -879,6 +884,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c = \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c = \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@ -1059,7 +1079,7 @@ gfor_built_src = $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@ -1518,6 +1538,19 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c10.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i1.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i2.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r10.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i1.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i2.Plo@am__quote@
@ -4584,6 +4617,97 @@ unpack_c16.lo: $(srcdir)/generated/unpack_c16.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unpack_c16.lo `test -f '$(srcdir)/generated/unpack_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/unpack_c16.c
matmulavx128_i1.lo: $(srcdir)/generated/matmulavx128_i1.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i1.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i1.Tpo -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i1.Tpo $(DEPDIR)/matmulavx128_i1.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i1.c' object='matmulavx128_i1.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
matmulavx128_i2.lo: $(srcdir)/generated/matmulavx128_i2.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i2.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i2.Tpo -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i2.Tpo $(DEPDIR)/matmulavx128_i2.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i2.c' object='matmulavx128_i2.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
matmulavx128_i4.lo: $(srcdir)/generated/matmulavx128_i4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i4.Tpo -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i4.Tpo $(DEPDIR)/matmulavx128_i4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i4.c' object='matmulavx128_i4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
matmulavx128_i8.lo: $(srcdir)/generated/matmulavx128_i8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i8.Tpo -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i8.Tpo $(DEPDIR)/matmulavx128_i8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i8.c' object='matmulavx128_i8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
matmulavx128_i16.lo: $(srcdir)/generated/matmulavx128_i16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i16.Tpo -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i16.Tpo $(DEPDIR)/matmulavx128_i16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i16.c' object='matmulavx128_i16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
matmulavx128_r4.lo: $(srcdir)/generated/matmulavx128_r4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r4.Tpo -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r4.Tpo $(DEPDIR)/matmulavx128_r4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r4.c' object='matmulavx128_r4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
matmulavx128_r8.lo: $(srcdir)/generated/matmulavx128_r8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r8.Tpo -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r8.Tpo $(DEPDIR)/matmulavx128_r8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r8.c' object='matmulavx128_r8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
matmulavx128_r10.lo: $(srcdir)/generated/matmulavx128_r10.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r10.Tpo -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r10.Tpo $(DEPDIR)/matmulavx128_r10.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r10.c' object='matmulavx128_r10.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
matmulavx128_r16.lo: $(srcdir)/generated/matmulavx128_r16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r16.Tpo -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r16.Tpo $(DEPDIR)/matmulavx128_r16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r16.c' object='matmulavx128_r16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
matmulavx128_c4.lo: $(srcdir)/generated/matmulavx128_c4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c4.Tpo -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c4.Tpo $(DEPDIR)/matmulavx128_c4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c4.c' object='matmulavx128_c4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
matmulavx128_c8.lo: $(srcdir)/generated/matmulavx128_c8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c8.Tpo -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c8.Tpo $(DEPDIR)/matmulavx128_c8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c8.c' object='matmulavx128_c8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
matmulavx128_c10.lo: $(srcdir)/generated/matmulavx128_c10.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c10.Tpo -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c10.Tpo $(DEPDIR)/matmulavx128_c10.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c10.c' object='matmulavx128_c10.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
matmulavx128_c16.lo: $(srcdir)/generated/matmulavx128_c16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c16.Tpo -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c16.Tpo $(DEPDIR)/matmulavx128_c16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c16.c' object='matmulavx128_c16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
spread_i1.lo: $(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT spread_i1.lo -MD -MP -MF $(DEPDIR)/spread_i1.Tpo -c -o spread_i1.lo `test -f '$(srcdir)/generated/spread_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/spread_i1.Tpo $(DEPDIR)/spread_i1.Plo
@ -5567,7 +5691,10 @@ uninstall-am: uninstall-cafexeclibLTLIBRARIES \
@LIBGFOR_USE_SYMVER_SUN_TRUE@@LIBGFOR_USE_SYMVER_TRUE@ > $@ || (rm -f $@ ; exit 1)
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
@HAVE_AVX128_TRUE@$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@ -5667,6 +5794,9 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@

View File

@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA3
dnl
dnl Defines HAVE_FMA3 in config.h when the compiler can compile a call
dnl to __builtin_fmaf with "-mfma -mno-fma4" (i.e. Intel-style FMA3
dnl without falling back to AMD FMA4).
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA3, 1,
[Define if FMA3 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA4
dnl
dnl Defines HAVE_FMA4 in config.h when the compiler can compile a call
dnl to __builtin_fmaf with "-mfma4 -mno-fma" (i.e. AMD-style FMA4
dnl without FMA3) — mirror image of LIBGFOR_CHECK_FMA3.
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA4, 1,
[Define if FMA4 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for -mprefer-avx128
dnl This also defines an automake conditional.
dnl
dnl Defines HAVE_AVX128 in config.h when the compiler accepts
dnl "-mavx -mprefer-avx128", and always evaluates the HAVE_AVX128
dnl automake conditional (true only on success).
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
void foo()
{
}]], [[]])],
[libgfor_have_avx128=yes
AC_DEFINE(HAVE_AVX128, 1,
[Define if -mprefer-avx128 is supported.])],
[libgfor_have_avx128=no])
dnl AM_CONDITIONAL must be invoked unconditionally: calling it only in
dnl the success branch leaves HAVE_AVX128_TRUE/HAVE_AVX128_FALSE
dnl undefined when the flag is unsupported, and configure then fails
dnl with "conditional HAVE_AVX128 was never defined".
AM_CONDITIONAL([HAVE_AVX128],[test "x$libgfor_have_avx128" = xyes])
CFLAGS="$ac_save_CFLAGS"
])

View File

@ -81,6 +81,9 @@
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
/* Define if -mprefer-avx128 is supported. */
#undef HAVE_AVX128
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
@ -375,6 +378,12 @@
/* Define to 1 if you have the `floorl' function. */
#undef HAVE_FLOORL
/* Define if FMA3 instructions can be compiled. */
#undef HAVE_FMA3
/* Define if FMA4 instructions can be compiled. */
#undef HAVE_FMA4
/* Define to 1 if you have the `fmod' function. */
#undef HAVE_FMOD

103
libgfortran/configure vendored
View File

@ -606,6 +606,8 @@ am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
get_gcc_base_ver
HAVE_AVX128_FALSE
HAVE_AVX128_TRUE
IEEE_FLAGS
IEEE_SUPPORT
IEEE_SUPPORT_FALSE
@ -12421,7 +12423,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12424 "configure"
#line 12426 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -12527,7 +12529,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12530 "configure"
#line 12532 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA3 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA4 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check if AVX128 works
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
void foo()
{
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX128 1" >>confdefs.h
if true; then
HAVE_AVX128_TRUE=
HAVE_AVX128_FALSE='#'
else
HAVE_AVX128_TRUE='#'
HAVE_AVX128_FALSE=
fi
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Determine what GCC version number to use in filesystem paths.
get_gcc_base_ver="cat"
@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0

View File

@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
# Check whether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
# Check for FMA3 extensions
LIBGFOR_CHECK_FMA3
# Check for FMA4 extensions
LIBGFOR_CHECK_FMA4
# Check if AVX128 works
LIBGFOR_CHECK_AVX128
# Determine what GCC version number to use in filesystem paths.
GCC_BASE_VER

View File

@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i1_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i1_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i2_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i2_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i2_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i2_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -0,0 +1,67 @@
`/* Implementation of the MATMUL intrinsic
Copyright (C) 2002-2017 Free Software Foundation, Inc.
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
This file is part of the GNU Fortran runtime library (libgfortran).
Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>'
include(iparm.m4)dnl
/* These are the specific versions of matmul with -mprefer-avx128. */
`#if defined (HAVE_'rtype_name`)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const 'rtype_name` *, const 'rtype_name` *,
const int *, const 'rtype_name` *, const int *,
const 'rtype_name` *, 'rtype_name` *, const int *,
int, int);
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#endif
'