mirror of git://gcc.gnu.org/git/gcc.git
re PR libfortran/78379 (Processor-specific versions for matmul)
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
From-SVN: r248472
This commit is contained in:
parent
87e1e6036e
commit
1d5cf7fcf2
|
|
@ -1,3 +1,43 @@
|
|||
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
|
||||
|
||||
PR libfortran/78379
|
||||
* Makefile.am: Add generated/matmulavx128_*.c files.
|
||||
Handle them for compiling and setting the right flags.
|
||||
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
|
||||
* configure.ac: Call them.
|
||||
* Makefile.in: Regenerated.
|
||||
* config.h.in: Regenerated.
|
||||
* configure: Regenerated.
|
||||
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
|
||||
versions which use FMA3 or FMA4.
|
||||
* m4/matmulavx128.m4: New file.
|
||||
* generated/matmul_c10.c: Regenerated.
|
||||
* generated/matmul_c16.c: Regenerated.
|
||||
* generated/matmul_c4.c: Regenerated.
|
||||
* generated/matmul_c8.c: Regenerated.
|
||||
* generated/matmul_i1.c: Regenerated.
|
||||
* generated/matmul_i16.c: Regenerated.
|
||||
* generated/matmul_i2.c: Regenerated.
|
||||
* generated/matmul_i4.c: Regenerated.
|
||||
* generated/matmul_i8.c: Regenerated.
|
||||
* generated/matmul_r10.c: Regenerated.
|
||||
* generated/matmul_r16.c: Regenerated.
|
||||
* generated/matmul_r4.c: Regenerated.
|
||||
* generated/matmul_r8.c: Regenerated.
|
||||
* generated/matmulavx128_c10.c: New file.
|
||||
* generated/matmulavx128_c16.c: New file.
|
||||
* generated/matmulavx128_c4.c: New file.
|
||||
* generated/matmulavx128_c8.c: New file.
|
||||
* generated/matmulavx128_i1.c: New file.
|
||||
* generated/matmulavx128_i16.c: New file.
|
||||
* generated/matmulavx128_i2.c: New file.
|
||||
* generated/matmulavx128_i4.c: New file.
|
||||
* generated/matmulavx128_i8.c: New file.
|
||||
* generated/matmulavx128_r10.c: New file.
|
||||
* generated/matmulavx128_r16.c: New file.
|
||||
* generated/matmulavx128_r4.c: New file.
|
||||
* generated/matmulavx128_r8.c: New file.
|
||||
|
||||
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
|
||||
Jerry DeLisle <jvdelisle@gcc.gnu.org>
|
||||
|
||||
|
|
@ -14,7 +54,7 @@
|
|||
(st_endfile): Likewise.
|
||||
(st_rewind): Likewise.
|
||||
(st_flush): Likewise.
|
||||
|
||||
|
||||
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
|
||||
|
||||
PR libgfortran/80727
|
||||
|
|
|
|||
|
|
@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
|
|||
$(srcdir)/generated/matmul_c10.c \
|
||||
$(srcdir)/generated/matmul_c16.c
|
||||
|
||||
# Sources for the 128-bit AVX matmul variants, one file per type/kind suffix
# (i1 ... i16, r4 ... r16, c4 ... c16).  They are produced from
# m4/matmulavx128.m4 by the $(i_matmulavx128_c) rule and are listed in
# gfor_built_src.
i_matmulavx128_c= \
|
||||
$(srcdir)/generated/matmulavx128_i1.c \
|
||||
$(srcdir)/generated/matmulavx128_i2.c \
|
||||
$(srcdir)/generated/matmulavx128_i4.c \
|
||||
$(srcdir)/generated/matmulavx128_i8.c \
|
||||
$(srcdir)/generated/matmulavx128_i16.c \
|
||||
$(srcdir)/generated/matmulavx128_r4.c \
|
||||
$(srcdir)/generated/matmulavx128_r8.c \
|
||||
$(srcdir)/generated/matmulavx128_r10.c \
|
||||
$(srcdir)/generated/matmulavx128_r16.c \
|
||||
$(srcdir)/generated/matmulavx128_c4.c \
|
||||
$(srcdir)/generated/matmulavx128_c8.c \
|
||||
$(srcdir)/generated/matmulavx128_c10.c \
|
||||
$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
i_matmull_c= \
|
||||
$(srcdir)/generated/matmul_l4.c \
|
||||
$(srcdir)/generated/matmul_l8.c \
|
||||
|
|
@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
|
|||
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
|
||||
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
|
||||
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
|
||||
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
|
||||
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
|
||||
|
||||
|
|
@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
|
|||
intrinsics/f2c_specifics.F90
|
||||
|
||||
# Turn on vectorization and loop unrolling for matmul.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
|
||||
if HAVE_AVX128
|
||||
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
# (that is what the HAVE_AVX128 automake conditional guards).  The objects are
# named by stripping the directory from each $(i_matmulavx128_c) entry and
# replacing .c with .lo; each gets the same fast-math/vectorize/unroll flags as
# the generic matmul objects, with -mprefer-avx128 appended.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
|
||||
endif
|
||||
# Logical matmul doesn't vectorize.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
|
||||
|
||||
|
|
@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
|
|||
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
|
||||
|
||||
# Generate each AVX128 matmul source by running m4 on matmulavx128.m4.
# -Dfile=$@ defines the m4 macro `file' as the name of the target being built,
# and the m4 output is redirected into that same target.  The sources are
# rebuilt when matmulavx128.m4, matmul_internal.m4 or $(I_M4_DEPS) change.
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
|
||||
|
||||
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
|
||||
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
|
||||
|
||||
|
|
|
|||
|
|
@ -289,15 +289,20 @@ am__objects_32 = unpack_i1.lo unpack_i2.lo unpack_i4.lo unpack_i8.lo \
|
|||
unpack_i16.lo unpack_r4.lo unpack_r8.lo unpack_r10.lo \
|
||||
unpack_r16.lo unpack_c4.lo unpack_c8.lo unpack_c10.lo \
|
||||
unpack_c16.lo
|
||||
am__objects_33 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
|
||||
am__objects_33 = matmulavx128_i1.lo matmulavx128_i2.lo \
|
||||
matmulavx128_i4.lo matmulavx128_i8.lo matmulavx128_i16.lo \
|
||||
matmulavx128_r4.lo matmulavx128_r8.lo matmulavx128_r10.lo \
|
||||
matmulavx128_r16.lo matmulavx128_c4.lo matmulavx128_c8.lo \
|
||||
matmulavx128_c10.lo matmulavx128_c16.lo
|
||||
am__objects_34 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
|
||||
spread_i16.lo spread_r4.lo spread_r8.lo spread_r10.lo \
|
||||
spread_r16.lo spread_c4.lo spread_c8.lo spread_c10.lo \
|
||||
spread_c16.lo
|
||||
am__objects_34 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
|
||||
am__objects_35 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
|
||||
cshift0_i8.lo cshift0_i16.lo cshift0_r4.lo cshift0_r8.lo \
|
||||
cshift0_r10.lo cshift0_r16.lo cshift0_c4.lo cshift0_c8.lo \
|
||||
cshift0_c10.lo cshift0_c16.lo
|
||||
am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
||||
am__objects_36 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
||||
$(am__objects_7) $(am__objects_8) $(am__objects_9) \
|
||||
$(am__objects_10) $(am__objects_11) $(am__objects_12) \
|
||||
$(am__objects_13) $(am__objects_14) $(am__objects_15) \
|
||||
|
|
@ -307,14 +312,14 @@ am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
|
|||
$(am__objects_25) $(am__objects_26) $(am__objects_27) \
|
||||
$(am__objects_28) $(am__objects_29) $(am__objects_30) \
|
||||
$(am__objects_31) $(am__objects_32) $(am__objects_33) \
|
||||
$(am__objects_34)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_36 = close.lo file_pos.lo format.lo \
|
||||
$(am__objects_34) $(am__objects_35)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_37 = close.lo file_pos.lo format.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ inquire.lo intrinsics.lo list_read.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ lock.lo open.lo read.lo transfer.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ transfer128.lo unit.lo unix.lo write.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ fbuf.lo
|
||||
am__objects_37 = size_from_kind.lo $(am__objects_36)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_38 = access.lo c99_functions.lo \
|
||||
am__objects_38 = size_from_kind.lo $(am__objects_37)
|
||||
@LIBGFOR_MINIMAL_FALSE@am__objects_39 = access.lo c99_functions.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ chdir.lo chmod.lo clock.lo cpu_time.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ ctime.lo date_and_time.lo dtime.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ env.lo etime.lo execute_command_line.lo \
|
||||
|
|
@ -324,19 +329,19 @@ am__objects_37 = size_from_kind.lo $(am__objects_36)
|
|||
@LIBGFOR_MINIMAL_FALSE@ rename.lo stat.lo symlnk.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ system_clock.lo time.lo umask.lo \
|
||||
@LIBGFOR_MINIMAL_FALSE@ unlink.lo
|
||||
@IEEE_SUPPORT_TRUE@am__objects_39 = ieee_helper.lo
|
||||
am__objects_40 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
|
||||
@IEEE_SUPPORT_TRUE@am__objects_40 = ieee_helper.lo
|
||||
am__objects_41 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
|
||||
eoshift2.lo erfc_scaled.lo extends_type_of.lo fnum.lo \
|
||||
ierrno.lo ishftc.lo mvbits.lo move_alloc.lo pack_generic.lo \
|
||||
selected_char_kind.lo size.lo spread_generic.lo \
|
||||
string_intrinsics.lo rand.lo random.lo reshape_generic.lo \
|
||||
reshape_packed.lo selected_int_kind.lo selected_real_kind.lo \
|
||||
unpack_generic.lo in_pack_generic.lo in_unpack_generic.lo \
|
||||
$(am__objects_38) $(am__objects_39)
|
||||
@IEEE_SUPPORT_TRUE@am__objects_41 = ieee_arithmetic.lo \
|
||||
$(am__objects_39) $(am__objects_40)
|
||||
@IEEE_SUPPORT_TRUE@am__objects_42 = ieee_arithmetic.lo \
|
||||
@IEEE_SUPPORT_TRUE@ ieee_exceptions.lo ieee_features.lo
|
||||
am__objects_42 =
|
||||
am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
||||
am__objects_43 =
|
||||
am__objects_44 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
||||
_abs_i4.lo _abs_i8.lo _abs_i16.lo _abs_r4.lo _abs_r8.lo \
|
||||
_abs_r10.lo _abs_r16.lo _aimag_c4.lo _aimag_c8.lo \
|
||||
_aimag_c10.lo _aimag_c16.lo _exp_r4.lo _exp_r8.lo _exp_r10.lo \
|
||||
|
|
@ -360,19 +365,19 @@ am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
|
|||
_conjg_c4.lo _conjg_c8.lo _conjg_c10.lo _conjg_c16.lo \
|
||||
_aint_r4.lo _aint_r8.lo _aint_r10.lo _aint_r16.lo _anint_r4.lo \
|
||||
_anint_r8.lo _anint_r10.lo _anint_r16.lo
|
||||
am__objects_44 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
|
||||
am__objects_45 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
|
||||
_sign_r8.lo _sign_r10.lo _sign_r16.lo _dim_i4.lo _dim_i8.lo \
|
||||
_dim_i16.lo _dim_r4.lo _dim_r8.lo _dim_r10.lo _dim_r16.lo \
|
||||
_atan2_r4.lo _atan2_r8.lo _atan2_r10.lo _atan2_r16.lo \
|
||||
_mod_i4.lo _mod_i8.lo _mod_i16.lo _mod_r4.lo _mod_r8.lo \
|
||||
_mod_r10.lo _mod_r16.lo
|
||||
am__objects_45 = misc_specifics.lo
|
||||
am__objects_46 = $(am__objects_43) $(am__objects_44) $(am__objects_45) \
|
||||
am__objects_46 = misc_specifics.lo
|
||||
am__objects_47 = $(am__objects_44) $(am__objects_45) $(am__objects_46) \
|
||||
dprod_r8.lo f2c_specifics.lo
|
||||
am__objects_47 = $(am__objects_3) $(am__objects_35) $(am__objects_37) \
|
||||
$(am__objects_40) $(am__objects_41) $(am__objects_42) \
|
||||
$(am__objects_46)
|
||||
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_47)
|
||||
am__objects_48 = $(am__objects_3) $(am__objects_36) $(am__objects_38) \
|
||||
$(am__objects_41) $(am__objects_42) $(am__objects_43) \
|
||||
$(am__objects_47)
|
||||
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_48)
|
||||
@onestep_TRUE@am_libgfortran_la_OBJECTS = libgfortran_c.lo
|
||||
libgfortran_la_OBJECTS = $(am_libgfortran_la_OBJECTS)
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@
|
||||
|
|
@ -879,6 +884,21 @@ $(srcdir)/generated/matmul_c8.c \
|
|||
$(srcdir)/generated/matmul_c10.c \
|
||||
$(srcdir)/generated/matmul_c16.c
|
||||
|
||||
i_matmulavx128_c = \
|
||||
$(srcdir)/generated/matmulavx128_i1.c \
|
||||
$(srcdir)/generated/matmulavx128_i2.c \
|
||||
$(srcdir)/generated/matmulavx128_i4.c \
|
||||
$(srcdir)/generated/matmulavx128_i8.c \
|
||||
$(srcdir)/generated/matmulavx128_i16.c \
|
||||
$(srcdir)/generated/matmulavx128_r4.c \
|
||||
$(srcdir)/generated/matmulavx128_r8.c \
|
||||
$(srcdir)/generated/matmulavx128_r10.c \
|
||||
$(srcdir)/generated/matmulavx128_r16.c \
|
||||
$(srcdir)/generated/matmulavx128_c4.c \
|
||||
$(srcdir)/generated/matmulavx128_c8.c \
|
||||
$(srcdir)/generated/matmulavx128_c10.c \
|
||||
$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
i_matmull_c = \
|
||||
$(srcdir)/generated/matmul_l4.c \
|
||||
$(srcdir)/generated/matmul_l8.c \
|
||||
|
|
@ -1059,7 +1079,7 @@ gfor_built_src = $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
|
|||
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
|
||||
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
|
||||
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
|
||||
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
|
||||
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
|
||||
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
|
||||
|
||||
|
|
@ -1518,6 +1538,19 @@ distclean-compile:
|
|||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c10.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i1.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i2.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r10.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r4.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r8.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i1.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i16.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i2.Plo@am__quote@
|
||||
|
|
@ -4584,6 +4617,97 @@ unpack_c16.lo: $(srcdir)/generated/unpack_c16.c
|
|||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unpack_c16.lo `test -f '$(srcdir)/generated/unpack_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/unpack_c16.c
|
||||
|
||||
matmulavx128_i1.lo: $(srcdir)/generated/matmulavx128_i1.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i1.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i1.Tpo -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i1.Tpo $(DEPDIR)/matmulavx128_i1.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i1.c' object='matmulavx128_i1.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
|
||||
|
||||
matmulavx128_i2.lo: $(srcdir)/generated/matmulavx128_i2.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i2.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i2.Tpo -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i2.Tpo $(DEPDIR)/matmulavx128_i2.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i2.c' object='matmulavx128_i2.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
|
||||
|
||||
matmulavx128_i4.lo: $(srcdir)/generated/matmulavx128_i4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i4.Tpo -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i4.Tpo $(DEPDIR)/matmulavx128_i4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i4.c' object='matmulavx128_i4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
|
||||
|
||||
matmulavx128_i8.lo: $(srcdir)/generated/matmulavx128_i8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i8.Tpo -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i8.Tpo $(DEPDIR)/matmulavx128_i8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i8.c' object='matmulavx128_i8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
|
||||
|
||||
matmulavx128_i16.lo: $(srcdir)/generated/matmulavx128_i16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i16.Tpo -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i16.Tpo $(DEPDIR)/matmulavx128_i16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i16.c' object='matmulavx128_i16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
|
||||
|
||||
matmulavx128_r4.lo: $(srcdir)/generated/matmulavx128_r4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r4.Tpo -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r4.Tpo $(DEPDIR)/matmulavx128_r4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r4.c' object='matmulavx128_r4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
|
||||
|
||||
matmulavx128_r8.lo: $(srcdir)/generated/matmulavx128_r8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r8.Tpo -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r8.Tpo $(DEPDIR)/matmulavx128_r8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r8.c' object='matmulavx128_r8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
|
||||
|
||||
matmulavx128_r10.lo: $(srcdir)/generated/matmulavx128_r10.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r10.Tpo -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r10.Tpo $(DEPDIR)/matmulavx128_r10.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r10.c' object='matmulavx128_r10.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
|
||||
|
||||
matmulavx128_r16.lo: $(srcdir)/generated/matmulavx128_r16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r16.Tpo -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r16.Tpo $(DEPDIR)/matmulavx128_r16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r16.c' object='matmulavx128_r16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
|
||||
|
||||
matmulavx128_c4.lo: $(srcdir)/generated/matmulavx128_c4.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c4.Tpo -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c4.Tpo $(DEPDIR)/matmulavx128_c4.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c4.c' object='matmulavx128_c4.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
|
||||
|
||||
matmulavx128_c8.lo: $(srcdir)/generated/matmulavx128_c8.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c8.Tpo -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c8.Tpo $(DEPDIR)/matmulavx128_c8.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c8.c' object='matmulavx128_c8.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
|
||||
|
||||
matmulavx128_c10.lo: $(srcdir)/generated/matmulavx128_c10.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c10.Tpo -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c10.Tpo $(DEPDIR)/matmulavx128_c10.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c10.c' object='matmulavx128_c10.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
|
||||
|
||||
matmulavx128_c16.lo: $(srcdir)/generated/matmulavx128_c16.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c16.Tpo -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c16.Tpo $(DEPDIR)/matmulavx128_c16.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c16.c' object='matmulavx128_c16.lo' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
|
||||
|
||||
spread_i1.lo: $(srcdir)/generated/spread_i1.c
|
||||
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT spread_i1.lo -MD -MP -MF $(DEPDIR)/spread_i1.Tpo -c -o spread_i1.lo `test -f '$(srcdir)/generated/spread_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/spread_i1.c
|
||||
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/spread_i1.Tpo $(DEPDIR)/spread_i1.Plo
|
||||
|
|
@ -5567,7 +5691,10 @@ uninstall-am: uninstall-cafexeclibLTLIBRARIES \
|
|||
@LIBGFOR_USE_SYMVER_SUN_TRUE@@LIBGFOR_USE_SYMVER_TRUE@ > $@ || (rm -f $@ ; exit 1)
|
||||
|
||||
# Turn on vectorization and loop unrolling for matmul.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
|
||||
|
||||
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
|
||||
@HAVE_AVX128_TRUE@$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
|
||||
# Logical matmul doesn't vectorize.
|
||||
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
|
||||
|
||||
|
|
@ -5667,6 +5794,9 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
|
|||
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
|
||||
|
||||
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
|
||||
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@
|
||||
|
||||
|
|
|
|||
|
|
@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
|
|||
[])
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
])
|
||||
|
||||
dnl LIBGFOR_CHECK_FMA3
dnl ------------------
dnl Probe whether the C compiler, run with "-O2 -mfma -mno-fma4", can
dnl compile a function that calls __builtin_fmaf.  If it can, HAVE_FMA3
dnl is defined in config.h.  The caller's CFLAGS are saved before the
dnl probe and restored afterwards.
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mfma -mno-fma4"
  AC_COMPILE_IFELSE(
    [AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
  return __builtin_fmaf (a, b, c);
}]], [[]])],
    [AC_DEFINE([HAVE_FMA3], [1],
       [Define if FMA3 instructions can be compiled.])],
    [])
  CFLAGS="$ac_save_CFLAGS"
])
|
||||
|
||||
dnl LIBGFOR_CHECK_FMA4
dnl ------------------
dnl Probe whether the C compiler, run with "-O2 -mfma4 -mno-fma", can
dnl compile a function that calls __builtin_fmaf.  If it can, HAVE_FMA4
dnl is defined in config.h.  The caller's CFLAGS are saved before the
dnl probe and restored afterwards.
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mfma4 -mno-fma"
  AC_COMPILE_IFELSE(
    [AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
  return __builtin_fmaf (a, b, c);
}]], [[]])],
    [AC_DEFINE([HAVE_FMA4], [1],
       [Define if FMA4 instructions can be compiled.])],
    [])
  CFLAGS="$ac_save_CFLAGS"
])
|
||||
|
||||
dnl LIBGFOR_CHECK_AVX128
dnl --------------------
dnl Check for -mprefer-avx128.
dnl
dnl Compiles an empty function with "-O2 -mavx -mprefer-avx128".  If that
dnl succeeds, HAVE_AVX128 is defined in config.h.  The automake conditional
dnl of the same name is defined on every run: true when the probe
dnl succeeded, false otherwise.
dnl
dnl The conditional must not be set from inside the success action only:
dnl configure checks at the end that HAVE_AVX128_TRUE / HAVE_AVX128_FALSE
dnl were assigned and aborts with "conditional was never defined" if the
dnl compiler rejected the flags.  The probe result is therefore recorded
dnl in a shell variable and the conditional is evaluated unconditionally
dnl after CFLAGS has been restored.
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mavx -mprefer-avx128"
  libgfor_have_avx128=no
  AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
  void foo()
  {
  }]], [[]])],
    [libgfor_have_avx128=yes
     AC_DEFINE([HAVE_AVX128], [1],
       [Define if -mprefer-avx128 is supported.])],
    [])
  CFLAGS="$ac_save_CFLAGS"
  AM_CONDITIONAL([HAVE_AVX128], [test "x$libgfor_have_avx128" = xyes])
])
|
||||
|
|
|
|||
|
|
@ -81,6 +81,9 @@
|
|||
/* Define if AVX instructions can be compiled. */
|
||||
#undef HAVE_AVX
|
||||
|
||||
/* Define if -mprefer-avx128 is supported. */
|
||||
#undef HAVE_AVX128
|
||||
|
||||
/* Define if AVX2 instructions can be compiled. */
|
||||
#undef HAVE_AVX2
|
||||
|
||||
|
|
@ -375,6 +378,12 @@
|
|||
/* Define to 1 if you have the `floorl' function. */
|
||||
#undef HAVE_FLOORL
|
||||
|
||||
/* Define if FMA3 instructions can be compiled. */
|
||||
#undef HAVE_FMA3
|
||||
|
||||
/* Define if FMA4 instructions can be compiled. */
|
||||
#undef HAVE_FMA4
|
||||
|
||||
/* Define to 1 if you have the `fmod' function. */
|
||||
#undef HAVE_FMOD
|
||||
|
||||
|
|
|
|||
|
|
@ -606,6 +606,8 @@ am__EXEEXT_TRUE
|
|||
LTLIBOBJS
|
||||
LIBOBJS
|
||||
get_gcc_base_ver
|
||||
HAVE_AVX128_FALSE
|
||||
HAVE_AVX128_TRUE
|
||||
IEEE_FLAGS
|
||||
IEEE_SUPPORT
|
||||
IEEE_SUPPORT_FALSE
|
||||
|
|
@ -12421,7 +12423,7 @@ else
|
|||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<_LT_EOF
|
||||
#line 12424 "configure"
|
||||
#line 12426 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
|
|
@ -12527,7 +12529,7 @@ else
|
|||
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
||||
lt_status=$lt_dlunknown
|
||||
cat > conftest.$ac_ext <<_LT_EOF
|
||||
#line 12530 "configure"
|
||||
#line 12532 "configure"
|
||||
#include "confdefs.h"
|
||||
|
||||
#if HAVE_DLFCN_H
|
||||
|
|
@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
|||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check for FMA3 extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma -mno-fma4"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check for FMA4 extensions
|
||||
|
||||
ac_save_CFLAGS="$CFLAGS"
|
||||
CFLAGS="-O2 -mfma4 -mno-fma"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
|
||||
float
|
||||
flt_mul_add (float a, float b, float c)
|
||||
{
|
||||
return __builtin_fmaf (a, b, c);
|
||||
}
|
||||
int
|
||||
main ()
|
||||
{
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_compile "$LINENO"; then :
|
||||
|
||||
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
||||
CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Check if AVX128 works
#
# Try to compile an empty function with -mavx -mprefer-avx128.  On success
# HAVE_AVX128 is added to confdefs.h and the HAVE_AVX128 automake
# conditional is set to true; on failure the conditional is set to false.

  ac_save_CFLAGS="$CFLAGS"
  CFLAGS="-O2 -mavx -mprefer-avx128"
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */

  void foo()
  {
  }
int
main ()
{

  ;
  return 0;
}
_ACEOF
# Both outcomes must assign HAVE_AVX128_TRUE/HAVE_AVX128_FALSE: a later
# sanity check aborts configure with "conditional "HAVE_AVX128" was never
# defined" when both variables are still empty.
if ac_fn_c_try_compile "$LINENO"; then :

$as_echo "#define HAVE_AVX128 1" >>confdefs.h

  HAVE_AVX128_TRUE=
  HAVE_AVX128_FALSE='#'
else
  HAVE_AVX128_TRUE='#'
  HAVE_AVX128_FALSE=
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
  CFLAGS="$ac_save_CFLAGS"
|
||||
|
||||
|
||||
# Determine what GCC version number to use in filesystem paths.
|
||||
|
||||
get_gcc_base_ver="cat"
|
||||
|
|
@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
|
|||
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
|
||||
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
|
||||
Usually this means the macro was only invoked conditionally." "$LINENO" 5
|
||||
fi
|
||||
|
||||
: ${CONFIG_STATUS=./config.status}
|
||||
ac_write_fail=0
|
||||
|
|
|
|||
|
|
@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
|
|||
# Check whether we support AVX512f extensions
|
||||
LIBGFOR_CHECK_AVX512F
|
||||
|
||||
# Check for FMA3 extensions
|
||||
LIBGFOR_CHECK_FMA3
|
||||
|
||||
# Check for FMA4 extensions
|
||||
LIBGFOR_CHECK_FMA4
|
||||
|
||||
# Check if AVX128 works
|
||||
LIBGFOR_CHECK_AVX128
|
||||
|
||||
# Determine what GCC version number to use in filesystem paths.
|
||||
GCC_BASE_VER
|
||||
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c10_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c10_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c10_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c10_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_c8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_c8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_c8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_c8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i1_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i1_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i1_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i1_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i2_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i2_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i2_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i2_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_i8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_i8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_i8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_i8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r10_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r10_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r10_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r10_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r16_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r16_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r16_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r16_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r4_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r4_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r4_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r4_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
|
|||
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto(matmul_r8_avx128_fma3);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
void
|
||||
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto(matmul_r8_avx128_fma4);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
static void
|
||||
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
|
||||
|
|
@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_r8_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_r8_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
|
|||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX512F */
|
||||
|
||||
/* AMD-specific functions with AVX128 and FMA3/FMA4. */
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto('matmul_name`);
|
||||
#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto('matmul_name`);
|
||||
#endif
|
||||
|
||||
/* Function to fall back to if there is no special processor-specific version. */
|
||||
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
|
||||
`static' include(matmul_internal.m4)dnl
|
||||
|
|
@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
|||
}
|
||||
#endif /* HAVE_AVX */
|
||||
}
|
||||
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
|
||||
{
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_fn = matmul_'rtype_code`_avx128_fma3;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
|
||||
{
|
||||
matmul_fn = matmul_'rtype_code`_avx128_fma4;
|
||||
goto store;
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
store:
|
||||
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,67 @@
|
|||
`/* Implementation of the MATMUL intrinsic
|
||||
Copyright (C) 2002-2017 Free Software Foundation, Inc.
|
||||
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
|
||||
|
||||
This file is part of the GNU Fortran runtime library (libgfortran).
|
||||
|
||||
Libgfortran is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 3 of the License, or (at your option) any later version.
|
||||
|
||||
Libgfortran is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
Under Section 7 of GPL version 3, you are granted additional
|
||||
permissions described in the GCC Runtime Library Exception, version
|
||||
3.1, as published by the Free Software Foundation.
|
||||
|
||||
You should have received a copy of the GNU General Public License and
|
||||
a copy of the GCC Runtime Library Exception along with this program;
|
||||
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "libgfortran.h"
|
||||
#include <string.h>
|
||||
#include <assert.h>'
|
||||
|
||||
include(iparm.m4)dnl
|
||||
|
||||
/* These are the specific versions of matmul with -mprefer-avx128. */
|
||||
|
||||
`#if defined (HAVE_'rtype_name`)
|
||||
|
||||
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||
passed to us by the front-end, in which case we call it for large
|
||||
matrices. */
|
||||
|
||||
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||
const int *, const 'rtype_name` *, const 'rtype_name` *,
|
||||
const int *, const 'rtype_name` *, const int *,
|
||||
const 'rtype_name` *, 'rtype_name` *, const int *,
|
||||
int, int);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
|
||||
internal_proto('matmul_name`);
|
||||
'include(matmul_internal.m4)dnl
|
||||
`#endif
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
|
||||
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
|
||||
`void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
|
||||
internal_proto('matmul_name`);
|
||||
'include(matmul_internal.m4)dnl
|
||||
`#endif
|
||||
|
||||
#endif
|
||||
'
|
||||
Loading…
Reference in New Issue