re PR libfortran/78379 (Processor-specific versions for matmul)

2017-05-25  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR libfortran/78379
	* Makefile.am: Add generated/matmulavx128_*.c files.
	Handle them for compiling and setting the right flags.
	* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
	* configure.ac: Call them.
	* Makefile.in: Regenerated.
	* config.h.in: Regenerated.
	* configure: Regenerated.
	* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
	versions which use FMA3 or FMA4.
	* m4/matmulavx128.m4: New file.
        * generated/matmul_c10.c: Regenerated.
        * generated/matmul_c16.c: Regenerated.
        * generated/matmul_c4.c: Regenerated.
        * generated/matmul_c8.c: Regenerated.
        * generated/matmul_i1.c: Regenerated.
        * generated/matmul_i16.c: Regenerated.
        * generated/matmul_i2.c: Regenerated.
        * generated/matmul_i4.c: Regenerated.
        * generated/matmul_i8.c: Regenerated.
        * generated/matmul_r10.c: Regenerated.
        * generated/matmul_r16.c: Regenerated.
        * generated/matmul_r4.c: Regenerated.
        * generated/matmul_r8.c: Regenerated.
        * generated/matmulavx128_c10.c: New file.
        * generated/matmulavx128_c16.c: New file.
        * generated/matmulavx128_c4.c: New file.
        * generated/matmulavx128_c8.c: New file.
        * generated/matmulavx128_i1.c: New file.
        * generated/matmulavx128_i16.c: New file.
        * generated/matmulavx128_i2.c: New file.
        * generated/matmulavx128_i4.c: New file.
        * generated/matmulavx128_i8.c: New file.
        * generated/matmulavx128_r10.c: New file.
        * generated/matmulavx128_r16.c: New file.
        * generated/matmulavx128_r4.c: New file.
        * generated/matmulavx128_r8.c: New file.

From-SVN: r248472
This commit is contained in:
Thomas Koenig 2017-05-25 21:51:27 +00:00
parent 87e1e6036e
commit 1d5cf7fcf2
35 changed files with 15964 additions and 27 deletions

View File

@ -1,3 +1,43 @@
2017-05-25 Thomas Koenig <tkoenig@gcc.gnu.org>
PR libfortran/78379
* Makefile.am: Add generated/matmulavx128_*.c files.
Handle them for compiling and setting the right flags.
* acinclude.m4: Add tests for FMA3, FMA4 and AVX128.
* configure.ac: Call them.
* Makefile.in: Regenerated.
* config.h.in: Regenerated.
* configure: Regenerated.
* m4/matmul.m4: Handle AMD chips by calling 128-bit AVX
versions which use FMA3 or FMA4.
* m4/matmulavx128.m4: New file.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
* generated/matmulavx128_c10.c: New file.
* generated/matmulavx128_c16.c: New file.
* generated/matmulavx128_c4.c: New file.
* generated/matmulavx128_c8.c: New file.
* generated/matmulavx128_i1.c: New file.
* generated/matmulavx128_i16.c: New file.
* generated/matmulavx128_i2.c: New file.
* generated/matmulavx128_i4.c: New file.
* generated/matmulavx128_i8.c: New file.
* generated/matmulavx128_r10.c: New file.
* generated/matmulavx128_r16.c: New file.
* generated/matmulavx128_r4.c: New file.
* generated/matmulavx128_r8.c: New file.
2017-05-19 Paul Thomas <pault@gcc.gnu.org>
Jerry DeLisle <jvdelisle@gcc.gnu.org>
@ -14,7 +54,7 @@
(st_endfile): Likewise.
(st_rewind): Likewise.
(st_flush): Likewise.
2017-05-15 Jerry DeLisle <jvdelisle@gcc.gnu.org>
PR libgfortran/80727

View File

@ -460,6 +460,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c= \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c= \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@ -641,7 +656,7 @@ gfor_built_src= $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@ -796,7 +811,12 @@ intrinsics/dprod_r8.f90 \
intrinsics/f2c_specifics.F90
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
if HAVE_AVX128
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
endif
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@ -936,6 +956,9 @@ $(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@

View File

@ -289,15 +289,20 @@ am__objects_32 = unpack_i1.lo unpack_i2.lo unpack_i4.lo unpack_i8.lo \
unpack_i16.lo unpack_r4.lo unpack_r8.lo unpack_r10.lo \
unpack_r16.lo unpack_c4.lo unpack_c8.lo unpack_c10.lo \
unpack_c16.lo
am__objects_33 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
am__objects_33 = matmulavx128_i1.lo matmulavx128_i2.lo \
matmulavx128_i4.lo matmulavx128_i8.lo matmulavx128_i16.lo \
matmulavx128_r4.lo matmulavx128_r8.lo matmulavx128_r10.lo \
matmulavx128_r16.lo matmulavx128_c4.lo matmulavx128_c8.lo \
matmulavx128_c10.lo matmulavx128_c16.lo
am__objects_34 = spread_i1.lo spread_i2.lo spread_i4.lo spread_i8.lo \
spread_i16.lo spread_r4.lo spread_r8.lo spread_r10.lo \
spread_r16.lo spread_c4.lo spread_c8.lo spread_c10.lo \
spread_c16.lo
am__objects_34 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
am__objects_35 = cshift0_i1.lo cshift0_i2.lo cshift0_i4.lo \
cshift0_i8.lo cshift0_i16.lo cshift0_r4.lo cshift0_r8.lo \
cshift0_r10.lo cshift0_r16.lo cshift0_c4.lo cshift0_c8.lo \
cshift0_c10.lo cshift0_c16.lo
am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
am__objects_36 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_7) $(am__objects_8) $(am__objects_9) \
$(am__objects_10) $(am__objects_11) $(am__objects_12) \
$(am__objects_13) $(am__objects_14) $(am__objects_15) \
@ -307,14 +312,14 @@ am__objects_35 = $(am__objects_4) $(am__objects_5) $(am__objects_6) \
$(am__objects_25) $(am__objects_26) $(am__objects_27) \
$(am__objects_28) $(am__objects_29) $(am__objects_30) \
$(am__objects_31) $(am__objects_32) $(am__objects_33) \
$(am__objects_34)
@LIBGFOR_MINIMAL_FALSE@am__objects_36 = close.lo file_pos.lo format.lo \
$(am__objects_34) $(am__objects_35)
@LIBGFOR_MINIMAL_FALSE@am__objects_37 = close.lo file_pos.lo format.lo \
@LIBGFOR_MINIMAL_FALSE@ inquire.lo intrinsics.lo list_read.lo \
@LIBGFOR_MINIMAL_FALSE@ lock.lo open.lo read.lo transfer.lo \
@LIBGFOR_MINIMAL_FALSE@ transfer128.lo unit.lo unix.lo write.lo \
@LIBGFOR_MINIMAL_FALSE@ fbuf.lo
am__objects_37 = size_from_kind.lo $(am__objects_36)
@LIBGFOR_MINIMAL_FALSE@am__objects_38 = access.lo c99_functions.lo \
am__objects_38 = size_from_kind.lo $(am__objects_37)
@LIBGFOR_MINIMAL_FALSE@am__objects_39 = access.lo c99_functions.lo \
@LIBGFOR_MINIMAL_FALSE@ chdir.lo chmod.lo clock.lo cpu_time.lo \
@LIBGFOR_MINIMAL_FALSE@ ctime.lo date_and_time.lo dtime.lo \
@LIBGFOR_MINIMAL_FALSE@ env.lo etime.lo execute_command_line.lo \
@ -324,19 +329,19 @@ am__objects_37 = size_from_kind.lo $(am__objects_36)
@LIBGFOR_MINIMAL_FALSE@ rename.lo stat.lo symlnk.lo \
@LIBGFOR_MINIMAL_FALSE@ system_clock.lo time.lo umask.lo \
@LIBGFOR_MINIMAL_FALSE@ unlink.lo
@IEEE_SUPPORT_TRUE@am__objects_39 = ieee_helper.lo
am__objects_40 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
@IEEE_SUPPORT_TRUE@am__objects_40 = ieee_helper.lo
am__objects_41 = associated.lo abort.lo args.lo cshift0.lo eoshift0.lo \
eoshift2.lo erfc_scaled.lo extends_type_of.lo fnum.lo \
ierrno.lo ishftc.lo mvbits.lo move_alloc.lo pack_generic.lo \
selected_char_kind.lo size.lo spread_generic.lo \
string_intrinsics.lo rand.lo random.lo reshape_generic.lo \
reshape_packed.lo selected_int_kind.lo selected_real_kind.lo \
unpack_generic.lo in_pack_generic.lo in_unpack_generic.lo \
$(am__objects_38) $(am__objects_39)
@IEEE_SUPPORT_TRUE@am__objects_41 = ieee_arithmetic.lo \
$(am__objects_39) $(am__objects_40)
@IEEE_SUPPORT_TRUE@am__objects_42 = ieee_arithmetic.lo \
@IEEE_SUPPORT_TRUE@ ieee_exceptions.lo ieee_features.lo
am__objects_42 =
am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
am__objects_43 =
am__objects_44 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_abs_i4.lo _abs_i8.lo _abs_i16.lo _abs_r4.lo _abs_r8.lo \
_abs_r10.lo _abs_r16.lo _aimag_c4.lo _aimag_c8.lo \
_aimag_c10.lo _aimag_c16.lo _exp_r4.lo _exp_r8.lo _exp_r10.lo \
@ -360,19 +365,19 @@ am__objects_43 = _abs_c4.lo _abs_c8.lo _abs_c10.lo _abs_c16.lo \
_conjg_c4.lo _conjg_c8.lo _conjg_c10.lo _conjg_c16.lo \
_aint_r4.lo _aint_r8.lo _aint_r10.lo _aint_r16.lo _anint_r4.lo \
_anint_r8.lo _anint_r10.lo _anint_r16.lo
am__objects_44 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
am__objects_45 = _sign_i4.lo _sign_i8.lo _sign_i16.lo _sign_r4.lo \
_sign_r8.lo _sign_r10.lo _sign_r16.lo _dim_i4.lo _dim_i8.lo \
_dim_i16.lo _dim_r4.lo _dim_r8.lo _dim_r10.lo _dim_r16.lo \
_atan2_r4.lo _atan2_r8.lo _atan2_r10.lo _atan2_r16.lo \
_mod_i4.lo _mod_i8.lo _mod_i16.lo _mod_r4.lo _mod_r8.lo \
_mod_r10.lo _mod_r16.lo
am__objects_45 = misc_specifics.lo
am__objects_46 = $(am__objects_43) $(am__objects_44) $(am__objects_45) \
am__objects_46 = misc_specifics.lo
am__objects_47 = $(am__objects_44) $(am__objects_45) $(am__objects_46) \
dprod_r8.lo f2c_specifics.lo
am__objects_47 = $(am__objects_3) $(am__objects_35) $(am__objects_37) \
$(am__objects_40) $(am__objects_41) $(am__objects_42) \
$(am__objects_46)
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_47)
am__objects_48 = $(am__objects_3) $(am__objects_36) $(am__objects_38) \
$(am__objects_41) $(am__objects_42) $(am__objects_43) \
$(am__objects_47)
@onestep_FALSE@am_libgfortran_la_OBJECTS = $(am__objects_48)
@onestep_TRUE@am_libgfortran_la_OBJECTS = libgfortran_c.lo
libgfortran_la_OBJECTS = $(am_libgfortran_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
@ -879,6 +884,21 @@ $(srcdir)/generated/matmul_c8.c \
$(srcdir)/generated/matmul_c10.c \
$(srcdir)/generated/matmul_c16.c
i_matmulavx128_c = \
$(srcdir)/generated/matmulavx128_i1.c \
$(srcdir)/generated/matmulavx128_i2.c \
$(srcdir)/generated/matmulavx128_i4.c \
$(srcdir)/generated/matmulavx128_i8.c \
$(srcdir)/generated/matmulavx128_i16.c \
$(srcdir)/generated/matmulavx128_r4.c \
$(srcdir)/generated/matmulavx128_r8.c \
$(srcdir)/generated/matmulavx128_r10.c \
$(srcdir)/generated/matmulavx128_r16.c \
$(srcdir)/generated/matmulavx128_c4.c \
$(srcdir)/generated/matmulavx128_c8.c \
$(srcdir)/generated/matmulavx128_c10.c \
$(srcdir)/generated/matmulavx128_c16.c
i_matmull_c = \
$(srcdir)/generated/matmul_l4.c \
$(srcdir)/generated/matmul_l8.c \
@ -1059,7 +1079,7 @@ gfor_built_src = $(i_all_c) $(i_any_c) $(i_count_c) $(i_maxloc0_c) \
$(i_iparity_c) $(i_norm2_c) $(i_parity_c) \
$(i_matmul_c) $(i_matmull_c) $(i_shape_c) $(i_eoshift1_c) \
$(i_eoshift3_c) $(i_cshift1_c) $(i_reshape_c) $(in_pack_c) $(in_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) \
$(i_pow_c) $(i_pack_c) $(i_unpack_c) $(i_matmulavx128_c) \
$(i_spread_c) selected_int_kind.inc selected_real_kind.inc kinds.h \
$(i_cshift0_c) kinds.inc c99_protos.inc fpu-target.h fpu-target.inc
@ -1518,6 +1538,19 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmul_r8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c10.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_c8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i1.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i2.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_i8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r10.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r4.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/matmulavx128_r8.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i1.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i16.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/maxloc0_16_i2.Plo@am__quote@
@ -4584,6 +4617,97 @@ unpack_c16.lo: $(srcdir)/generated/unpack_c16.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o unpack_c16.lo `test -f '$(srcdir)/generated/unpack_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/unpack_c16.c
matmulavx128_i1.lo: $(srcdir)/generated/matmulavx128_i1.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i1.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i1.Tpo -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i1.Tpo $(DEPDIR)/matmulavx128_i1.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i1.c' object='matmulavx128_i1.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i1.lo `test -f '$(srcdir)/generated/matmulavx128_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i1.c
matmulavx128_i2.lo: $(srcdir)/generated/matmulavx128_i2.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i2.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i2.Tpo -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i2.Tpo $(DEPDIR)/matmulavx128_i2.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i2.c' object='matmulavx128_i2.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i2.lo `test -f '$(srcdir)/generated/matmulavx128_i2.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i2.c
matmulavx128_i4.lo: $(srcdir)/generated/matmulavx128_i4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i4.Tpo -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i4.Tpo $(DEPDIR)/matmulavx128_i4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i4.c' object='matmulavx128_i4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i4.lo `test -f '$(srcdir)/generated/matmulavx128_i4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i4.c
matmulavx128_i8.lo: $(srcdir)/generated/matmulavx128_i8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i8.Tpo -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i8.Tpo $(DEPDIR)/matmulavx128_i8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i8.c' object='matmulavx128_i8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i8.lo `test -f '$(srcdir)/generated/matmulavx128_i8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i8.c
matmulavx128_i16.lo: $(srcdir)/generated/matmulavx128_i16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_i16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_i16.Tpo -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_i16.Tpo $(DEPDIR)/matmulavx128_i16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_i16.c' object='matmulavx128_i16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_i16.lo `test -f '$(srcdir)/generated/matmulavx128_i16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_i16.c
matmulavx128_r4.lo: $(srcdir)/generated/matmulavx128_r4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r4.Tpo -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r4.Tpo $(DEPDIR)/matmulavx128_r4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r4.c' object='matmulavx128_r4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r4.lo `test -f '$(srcdir)/generated/matmulavx128_r4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r4.c
matmulavx128_r8.lo: $(srcdir)/generated/matmulavx128_r8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r8.Tpo -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r8.Tpo $(DEPDIR)/matmulavx128_r8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r8.c' object='matmulavx128_r8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r8.lo `test -f '$(srcdir)/generated/matmulavx128_r8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r8.c
matmulavx128_r10.lo: $(srcdir)/generated/matmulavx128_r10.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r10.Tpo -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r10.Tpo $(DEPDIR)/matmulavx128_r10.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r10.c' object='matmulavx128_r10.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r10.lo `test -f '$(srcdir)/generated/matmulavx128_r10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r10.c
matmulavx128_r16.lo: $(srcdir)/generated/matmulavx128_r16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_r16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_r16.Tpo -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_r16.Tpo $(DEPDIR)/matmulavx128_r16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_r16.c' object='matmulavx128_r16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_r16.lo `test -f '$(srcdir)/generated/matmulavx128_r16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_r16.c
matmulavx128_c4.lo: $(srcdir)/generated/matmulavx128_c4.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c4.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c4.Tpo -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c4.Tpo $(DEPDIR)/matmulavx128_c4.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c4.c' object='matmulavx128_c4.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c4.lo `test -f '$(srcdir)/generated/matmulavx128_c4.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c4.c
matmulavx128_c8.lo: $(srcdir)/generated/matmulavx128_c8.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c8.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c8.Tpo -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c8.Tpo $(DEPDIR)/matmulavx128_c8.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c8.c' object='matmulavx128_c8.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c8.lo `test -f '$(srcdir)/generated/matmulavx128_c8.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c8.c
matmulavx128_c10.lo: $(srcdir)/generated/matmulavx128_c10.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c10.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c10.Tpo -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c10.Tpo $(DEPDIR)/matmulavx128_c10.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c10.c' object='matmulavx128_c10.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c10.lo `test -f '$(srcdir)/generated/matmulavx128_c10.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c10.c
matmulavx128_c16.lo: $(srcdir)/generated/matmulavx128_c16.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT matmulavx128_c16.lo -MD -MP -MF $(DEPDIR)/matmulavx128_c16.Tpo -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/matmulavx128_c16.Tpo $(DEPDIR)/matmulavx128_c16.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/generated/matmulavx128_c16.c' object='matmulavx128_c16.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o matmulavx128_c16.lo `test -f '$(srcdir)/generated/matmulavx128_c16.c' || echo '$(srcdir)/'`$(srcdir)/generated/matmulavx128_c16.c
spread_i1.lo: $(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT spread_i1.lo -MD -MP -MF $(DEPDIR)/spread_i1.Tpo -c -o spread_i1.lo `test -f '$(srcdir)/generated/spread_i1.c' || echo '$(srcdir)/'`$(srcdir)/generated/spread_i1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/spread_i1.Tpo $(DEPDIR)/spread_i1.Plo
@ -5567,7 +5691,10 @@ uninstall-am: uninstall-cafexeclibLTLIBRARIES \
@LIBGFOR_USE_SYMVER_SUN_TRUE@@LIBGFOR_USE_SYMVER_TRUE@ > $@ || (rm -f $@ ; exit 1)
# Turn on vectorization and loop unrolling for matmul.
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
$(patsubst %.c,%.lo,$(notdir $(i_matmul_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4
# Turn on AVX128 for AMD-specific matmul, but only if the compiler understands -mprefer-avx128
@HAVE_AVX128_TRUE@$(patsubst %.c,%.lo,$(notdir $(i_matmulavx128_c))): AM_CFLAGS += -ffast-math -ftree-vectorize -funroll-loops --param max-unroll-times=4 -mprefer-avx128
# Logical matmul doesn't vectorize.
$(patsubst %.c,%.lo,$(notdir $(i_matmull_c))): AM_CFLAGS += -funroll-loops
@ -5667,6 +5794,9 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmulavx128_c): m4/matmulavx128.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmulavx128.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmull.m4 > $@

View File

@ -452,3 +452,53 @@ AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA3
dnl
dnl Defines HAVE_FMA3 in config.h when the compiler can compile a call
dnl to __builtin_fmaf with "-mfma -mno-fma4" (i.e. Intel-style FMA3
dnl without falling back to AMD FMA4).
AC_DEFUN([LIBGFOR_CHECK_FMA3], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA3, 1,
[Define if FMA3 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for FMA4
dnl
dnl Defines HAVE_FMA4 in config.h when the compiler can compile a call
dnl to __builtin_fmaf with "-mfma4 -mno-fma" (i.e. AMD-style FMA4
dnl without FMA3) — mirror image of LIBGFOR_CHECK_FMA3.
AC_DEFUN([LIBGFOR_CHECK_FMA4], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}]], [[]])],
AC_DEFINE(HAVE_FMA4, 1,
[Define if FMA4 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for -mprefer-avx128
dnl This also defines an automake conditional.
dnl
dnl Defines HAVE_AVX128 in config.h when the compiler accepts
dnl "-mavx -mprefer-avx128", and always evaluates the HAVE_AVX128
dnl automake conditional (true only on success).
AC_DEFUN([LIBGFOR_CHECK_AVX128], [
dnl Probe with temporary CFLAGS; the caller's CFLAGS are restored below.
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
void foo()
{
}]], [[]])],
[libgfor_have_avx128=yes
AC_DEFINE(HAVE_AVX128, 1,
[Define if -mprefer-avx128 is supported.])],
[libgfor_have_avx128=no])
dnl AM_CONDITIONAL must be invoked unconditionally: calling it only in
dnl the success branch leaves HAVE_AVX128_TRUE/HAVE_AVX128_FALSE
dnl undefined when the flag is unsupported, and configure then fails
dnl with "conditional HAVE_AVX128 was never defined".
AM_CONDITIONAL([HAVE_AVX128],[test "x$libgfor_have_avx128" = xyes])
CFLAGS="$ac_save_CFLAGS"
])

View File

@ -81,6 +81,9 @@
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
/* Define if -mprefer-avx128 is supported. */
#undef HAVE_AVX128
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
@ -375,6 +378,12 @@
/* Define to 1 if you have the `floorl' function. */
#undef HAVE_FLOORL
/* Define if FMA3 instructions can be compiled. */
#undef HAVE_FMA3
/* Define if FMA4 instructions can be compiled. */
#undef HAVE_FMA4
/* Define to 1 if you have the `fmod' function. */
#undef HAVE_FMOD

103
libgfortran/configure vendored
View File

@ -606,6 +606,8 @@ am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
get_gcc_base_ver
HAVE_AVX128_FALSE
HAVE_AVX128_TRUE
IEEE_FLAGS
IEEE_SUPPORT
IEEE_SUPPORT_FALSE
@ -12421,7 +12423,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12424 "configure"
#line 12426 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -12527,7 +12529,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<_LT_EOF
#line 12530 "configure"
#line 12532 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@ -26363,6 +26365,99 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA3 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma -mno-fma4"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA3 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check for FMA4 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mfma4 -mno-fma"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
float
flt_mul_add (float a, float b, float c)
{
return __builtin_fmaf (a, b, c);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_FMA4 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check if AVX128 works
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx -mprefer-avx128"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
void foo()
{
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX128 1" >>confdefs.h
if true; then
HAVE_AVX128_TRUE=
HAVE_AVX128_FALSE='#'
else
HAVE_AVX128_TRUE='#'
HAVE_AVX128_FALSE=
fi
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Determine what GCC version number to use in filesystem paths.
get_gcc_base_ver="cat"
@ -26615,6 +26710,10 @@ if test -z "${IEEE_SUPPORT_TRUE}" && test -z "${IEEE_SUPPORT_FALSE}"; then
as_fn_error "conditional \"IEEE_SUPPORT\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${HAVE_AVX128_TRUE}" && test -z "${HAVE_AVX128_FALSE}"; then
as_fn_error "conditional \"HAVE_AVX128\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0

View File

@ -624,6 +624,15 @@ LIBGFOR_CHECK_AVX2
# Check whether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
# Check for FMA3 extensions
LIBGFOR_CHECK_FMA3
# Check for FMA4 extensions
LIBGFOR_CHECK_FMA4
# Check if AVX128 works
LIBGFOR_CHECK_AVX128
# Determine what GCC version number to use in filesystem paths.
GCC_BASE_VER

View File

@ -1734,6 +1734,24 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma3 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c10_avx128_fma4 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c10_vanilla (gfc_array_c10 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma3 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c16_avx128_fma4 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c16_vanilla (gfc_array_c16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma3 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c4_avx128_fma4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c4_vanilla (gfc_array_c4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma3 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_c8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_c8_avx128_fma4 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_c8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_c8_vanilla (gfc_array_c8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_c8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_c8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma3 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i1_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i1_avx128_fma4 (gfc_array_i1 * const restrict retarray,
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i1_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i1_vanilla (gfc_array_i1 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i1_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i1_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma3 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i16_avx128_fma4 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i16_vanilla (gfc_array_i16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma3 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i2_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i2_avx128_fma4 (gfc_array_i2 * const restrict retarray,
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i2_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i2_vanilla (gfc_array_i2 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i2_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i2_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma3 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i4_avx128_fma4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i4_vanilla (gfc_array_i4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma3 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_i8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_i8_avx128_fma4 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_i8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_i8_vanilla (gfc_array_i8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_i8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_i8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma3 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r10_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r10_avx128_fma4 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r10_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r10_vanilla (gfc_array_r10 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r10_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r10_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma3 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r16_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r16_avx128_fma4 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r16_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r16_vanilla (gfc_array_r16 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r16_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r16_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma3 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r4_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r4_avx128_fma4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r4_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r4_vanilla (gfc_array_r4 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r4_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r4_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -1734,6 +1734,24 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict retarray,
#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma3 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto(matmul_r8_avx128_fma3);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
void
matmul_r8_avx128_fma4 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto(matmul_r8_avx128_fma4);
#endif
/* Function to fall back to if there is no special processor-specific version. */
static void
matmul_r8_vanilla (gfc_array_r8 * const restrict retarray,
@ -2332,6 +2350,26 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_r8_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_r8_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -106,6 +106,26 @@ static' include(matmul_internal.m4)dnl
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
/* AMD-specific functions with AVX128 and FMA3/FMA4.  */
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
@ -161,6 +181,26 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
}
#endif /* HAVE_AVX */
}
else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
{
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma3;
goto store;
}
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
{
matmul_fn = matmul_'rtype_code`_avx128_fma4;
goto store;
}
#endif
}
store:
__atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
}

View File

@ -0,0 +1,67 @@
`/* Implementation of the MATMUL intrinsic
Copyright (C) 2002-2017 Free Software Foundation, Inc.
Contributed by Thomas Koenig <tkoenig@gcc.gnu.org>.
This file is part of the GNU Fortran runtime library (libgfortran).
Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgfortran.h"
#include <string.h>
#include <assert.h>'
include(iparm.m4)dnl
/* These are the specific versions of matmul with -mprefer-avx128. */
`#if defined (HAVE_'rtype_name`)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const 'rtype_name` *, const 'rtype_name` *,
const int *, const 'rtype_name` *, const int *,
const 'rtype_name` *, 'rtype_name` *, const int *,
int, int);
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
'include(matmul_internal.m4)dnl
`#endif
#endif
'