re PR fortran/26025 (Optionally use BLAS for matmul)

PR fortran/26025 * lang.opt: Add -fexternal-blas and -fblas-matmul-limit options. * options.c (gfc_init_options): Initialize new flags. (gfc_handle_option): Handle new flags. * gfortran.h (gfc_option): Add flag_external_blas and blas_matmul_limit flags. * trans-expr.c (gfc_conv_function_call): Use new argument append_args, appending it at the end of the argument list built for a function call. * trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new append_args argument to gfc_trans_call. * trans.h (gfc_conv_function_call): Update prototype. * trans-decl.c (gfc_build_intrinsic_function_decls): Add prototypes for BLAS ?gemm routines. * trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the extra arguments given to the library matmul function, and give them to gfc_conv_function_call. * invoke.texi: Add documentation for -fexternal-blas and -fblas-matmul-limit. * m4/matmul.m4: Add possible call to gemm routine. * generated/matmul_r8.c: Regenerate. * generated/matmul_r16.c: Regenerate. * generated/matmul_c8.c: Regenerate. * generated/matmul_i8.c: Regenerate. * generated/matmul_c16.c: Regenerate. * generated/matmul_r10.c: Regenerate. * generated/matmul_r4.c: Regenerate. * generated/matmul_c10.c: Regenerate. * generated/matmul_c4.c: Regenerate. * generated/matmul_i4.c: Regenerate. * generated/matmul_i16.c: Regenerate. From-SVN: r117948
2006-10-22 09:41:48 +02:00 · 2006-10-22 09:41:48 +02:00 · 5a0aad3165
parent d4f6a8099d
commit 5a0aad3165
23 changed files with 726 additions and 43 deletions
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@ -1,3 +1,25 @@
 2006-10-22  Francois-Xavier Coudert  <coudert@clipper.ens.fr>
 	PR fortran/26025
 	* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
 	* options.c (gfc_init_options): Initialize new flags.
 	(gfc_handle_option): Handle new flags.
 	* gfortran.h (gfc_option): Add flag_external_blas and
 	blas_matmul_limit flags.
 	* trans-expr.c (gfc_conv_function_call): Use new argument
 	append_args, appending it at the end of the argument list
 	built for a function call.
 	* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
 	append_args argument to gfc_trans_call.
 	* trans.h (gfc_conv_function_call): Update prototype.
 	* trans-decl.c (gfc_build_intrinsic_function_decls): Add
 	prototypes for BLAS ?gemm routines.
 	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
 	extra arguments given to the library matmul function, and give
 	them to gfc_conv_function_call.
 	* invoke.texi: Add documentation for -fexternal-blas and
 	-fblas-matmul-limit.
 2006-10-21  Kaveh R. Ghazi  <ghazi@caip.rutgers.edu>
 	* Make-lang.in (F95_LIBS): Delete.
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@ -1652,6 +1652,8 @@ typedef struct
  int flag_f2c;
  int flag_automatic;
  int flag_backslash;
  int flag_external_blas;
  int blas_matmul_limit;
  int flag_cray_pointer;
  int flag_d_lines;
  int flag_openmp;
--- a/gcc/fortran/invoke.texi
+++ b/gcc/fortran/invoke.texi
@ -152,7 +152,8 @@ by type.  Explanations are in the following sections.
@gccoptlist{
 -fno-automatic -ff2c -fno-underscoring  -fsecond-underscore @gol
 -fbounds-check  -fmax-stack-var-size=@var{n} @gol
-fpack-derived  -frepack-arrays  -fshort-enums}
+-fpack-derived  -frepack-arrays  -fshort-enums -fexternal-blas
 -fblas-matmul-limit=@var{n}}
@end table
@menu
@ -859,6 +860,27 @@ This option is provided for interoperability with C code that was
 compiled with the @command{-fshort-enums} option.  It will make
 GNU Fortran choose the smallest @code{INTEGER} kind a given
 enumerator set will fit in, and give all its enumerators this kind.
@cindex -fexternal-blas
@item -fexternal-blas
 This option will make gfortran generate calls to BLAS functions for some
 matrix operations like @code{MATMUL}, instead of using our own
 algorithms, if the size of the matrices involved is larger than a given
 limit (see @command{-fblas-matmul-limit}).  This may be profitable if an
 optimized vendor BLAS library is available.  The BLAS library will have
 to be specified at link time.
@cindex -fblas-matmul-limit
@item -fblas-matmul-limit=@var{n}
 Only significant when @command{-fexternal-blas} is in effect.
 Matrix multiplication of matrices with size larger than (or equal to) @var{n}
 will be performed by calls to BLAS functions, while others will be
 handled by @command{gfortran} internal algorithms. If the matrices
 involved are not square, the size comparison is performed using the
 geometric mean of the dimensions of the argument and result matrices.
 The default value for @var{n} is 30.
@end table
@xref{Code Gen Options,,Options for Code Generation Conventions,
--- a/gcc/fortran/lang.opt
+++ b/gcc/fortran/lang.opt
@ -85,6 +85,14 @@ fbackslash
 Fortran
 Specify that backslash in string introduces an escape character
 fexternal-blas
 Fortran
 Specify that an external BLAS library should be used for matmul calls on large-size arrays
 fblas-matmul-limit=
 Fortran RejectNegative Joined UInteger
 -fblas-matmul-limit=<n>        Size of the smallest matrix for which matmul will use BLAS
 fdefault-double-8
 Fortran
 Set the default double precision kind to an 8 byte wide type
--- a/gcc/fortran/options.c
+++ b/gcc/fortran/options.c
@ -80,6 +80,8 @@ gfc_init_options (unsigned int argc ATTRIBUTE_UNUSED,
  gfc_option.flag_preprocessed = 0;
  gfc_option.flag_automatic = 1;
  gfc_option.flag_backslash = 1;
  gfc_option.flag_external_blas = 0;
  gfc_option.blas_matmul_limit = 30;
  gfc_option.flag_cray_pointer = 0;
  gfc_option.flag_d_lines = -1;
  gfc_option.flag_openmp = 0;
@ -450,6 +452,14 @@ gfc_handle_option (size_t scode, const char *arg, int value)
      gfc_option.flag_dollar_ok = value;
      break;
    case OPT_fexternal_blas:
      gfc_option.flag_external_blas = value;
      break;
    case OPT_fblas_matmul_limit_:
      gfc_option.blas_matmul_limit = value;
      break;
    case OPT_fd_lines_as_code:
      gfc_option.flag_d_lines = 1;
      break;
--- a/gcc/fortran/trans-decl.c
+++ b/gcc/fortran/trans-decl.c
@ -143,6 +143,12 @@ tree gfor_fndecl_iargc;
 tree gfor_fndecl_si_kind;
 tree gfor_fndecl_sr_kind;
 /* BLAS gemm functions.  */
 tree gfor_fndecl_sgemm;
 tree gfor_fndecl_dgemm;
 tree gfor_fndecl_cgemm;
 tree gfor_fndecl_zgemm;
 static void
 gfc_add_decl_to_parent_function (tree decl)
@ -2186,6 +2192,49 @@ gfc_build_intrinsic_function_decls (void)
 				       gfc_int4_type_node, 1,
 				       gfc_real16_type_node);
  /* BLAS functions.  */
  {
    tree pint = build_pointer_type (gfc_c_int_type_node);
    tree ps = build_pointer_type (gfc_get_real_type (gfc_default_real_kind));
    tree pd = build_pointer_type (gfc_get_real_type (gfc_default_double_kind));
    tree pc = build_pointer_type (gfc_get_complex_type (gfc_default_real_kind));
    tree pz = build_pointer_type
 		(gfc_get_complex_type (gfc_default_double_kind));
    gfor_fndecl_sgemm = gfc_build_library_function_decl
 			  (get_identifier
 			     (gfc_option.flag_underscoring ? "sgemm_"
 							   : "sgemm"),
 			   void_type_node, 15, pchar_type_node,
 			   pchar_type_node, pint, pint, pint, ps, ps, pint,
 			   ps, pint, ps, ps, pint, gfc_c_int_type_node,
 			   gfc_c_int_type_node);
    gfor_fndecl_dgemm = gfc_build_library_function_decl
 			  (get_identifier
 			     (gfc_option.flag_underscoring ? "dgemm_"
 							   : "dgemm"),
 			   void_type_node, 15, pchar_type_node,
 			   pchar_type_node, pint, pint, pint, pd, pd, pint,
 			   pd, pint, pd, pd, pint, gfc_c_int_type_node,
 			   gfc_c_int_type_node);
    gfor_fndecl_cgemm = gfc_build_library_function_decl
 			  (get_identifier
 			     (gfc_option.flag_underscoring ? "cgemm_"
 							   : "cgemm"),
 			   void_type_node, 15, pchar_type_node,
 			   pchar_type_node, pint, pint, pint, pc, pc, pint,
 			   pc, pint, pc, pc, pint, gfc_c_int_type_node,
 			   gfc_c_int_type_node);
    gfor_fndecl_zgemm = gfc_build_library_function_decl
 			  (get_identifier
 			     (gfc_option.flag_underscoring ? "zgemm_"
 							   : "zgemm"),
 			   void_type_node, 15, pchar_type_node,
 			   pchar_type_node, pint, pint, pint, pz, pz, pint,
 			   pz, pint, pz, pz, pint, gfc_c_int_type_node,
 			   gfc_c_int_type_node);
  }
  /* Other functions.  */
  gfor_fndecl_size0 =
    gfc_build_library_function_decl (get_identifier (PREFIX("size0")),
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@ -1853,7 +1853,7 @@ is_aliased_array (gfc_expr * e)
 int
 gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
-			gfc_actual_arglist * arg)
+			gfc_actual_arglist * arg, tree append_args)
 {
  gfc_interface_mapping mapping;
  tree arglist;
@ -2226,6 +2226,11 @@ gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
  /* Add the hidden string length parameters to the arguments.  */
  arglist = chainon (arglist, stringargs);
  /* We may want to append extra arguments here.  This is used e.g. for
     calls to libgfortran_matmul_??, which need extra information.  */
  if (append_args != NULL_TREE)
    arglist = chainon (arglist, append_args);
  /* Generate the actual call.  */
  gfc_conv_function_val (se, sym);
  /* If there are alternate return labels, function type should be
@ -2545,7 +2550,7 @@ gfc_conv_function_expr (gfc_se * se, gfc_expr * expr)
  sym = expr->value.function.esym;
  if (!sym)
    sym = expr->symtree->n.sym;
-  gfc_conv_function_call (se, sym, expr->value.function.actual);
+  gfc_conv_function_call (se, sym, expr->value.function.actual, NULL_TREE);
 }
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@ -1378,6 +1378,7 @@ static void
 gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
 {
  gfc_symbol *sym;
  tree append_args;
  gcc_assert (!se->ss || se->ss->expr == expr);
@ -1387,7 +1388,54 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
    gcc_assert (expr->rank == 0);
  sym = gfc_get_symbol_for_expr (expr);
-  gfc_conv_function_call (se, sym, expr->value.function.actual);
+
  /* Calls to libgfortran_matmul need to be appended special arguments,
     to be able to call the BLAS ?gemm functions if required and possible.  */
  append_args = NULL_TREE;
  if (expr->value.function.isym->generic_id == GFC_ISYM_MATMUL
      && sym->ts.type != BT_LOGICAL)
    {
      tree cint = gfc_get_int_type (gfc_c_int_kind);
      if (gfc_option.flag_external_blas
 	  && (sym->ts.type == BT_REAL || sym->ts.type == BT_COMPLEX)
 	  && (sym->ts.kind == gfc_default_real_kind
 	      || sym->ts.kind == gfc_default_double_kind))
 	{
 	  tree gemm_fndecl;
 	  if (sym->ts.type == BT_REAL)
 	    {
 	      if (sym->ts.kind == gfc_default_real_kind)
 		gemm_fndecl = gfor_fndecl_sgemm;
 	      else
 		gemm_fndecl = gfor_fndecl_dgemm;
 	    }
 	  else
 	    {
 	      if (sym->ts.kind == gfc_default_real_kind)
 		gemm_fndecl = gfor_fndecl_cgemm;
 	      else
 		gemm_fndecl = gfor_fndecl_zgemm;
 	    }
 	  append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 1));
 	  append_args = gfc_chainon_list
 			  (append_args, build_int_cst
 					  (cint, gfc_option.blas_matmul_limit));
 	  append_args = gfc_chainon_list (append_args,
 					  gfc_build_addr_expr (NULL_TREE,
 							       gemm_fndecl));
 	}
      else
 	{
 	  append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 0));
 	  append_args = gfc_chainon_list (append_args, build_int_cst (cint, 0));
 	  append_args = gfc_chainon_list (append_args, null_pointer_node);
 	}
    }
  gfc_conv_function_call (se, sym, expr->value.function.actual, append_args);
  gfc_free (sym);
 }
--- a/gcc/fortran/trans-stmt.c
+++ b/gcc/fortran/trans-stmt.c
@ -334,7 +334,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
      /* Translate the call.  */
      has_alternate_specifier
-	= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual);
+	= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual,
 				  NULL_TREE);
      /* A subroutine without side-effect, by definition, does nothing!  */
      TREE_SIDE_EFFECTS (se.expr) = 1;
@ -399,7 +400,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
      gfc_init_block (&block);
      /* Add the subroutine call to the block.  */
-      gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual);
+      gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual,
 			      NULL_TREE);
      gfc_add_expr_to_block (&loopse.pre, loopse.expr);
      gfc_add_block_to_block (&block, &loopse.pre);
--- a/gcc/fortran/trans.h
+++ b/gcc/fortran/trans.h
@ -303,7 +303,8 @@ void gfc_conv_intrinsic_function (gfc_se *, gfc_expr *);
 int gfc_is_intrinsic_libcall (gfc_expr *);
 /* Also used to CALL subroutines.  */
-int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *);
+int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *,
 			    tree);
 /* gfc_trans_* shouldn't call push/poplevel, use gfc_push/pop_scope */
 /* Generate code for a scalar assignment.  */
@ -507,6 +508,12 @@ extern GTY(()) tree gfor_fndecl_math_exponent8;
 extern GTY(()) tree gfor_fndecl_math_exponent10;
 extern GTY(()) tree gfor_fndecl_math_exponent16;
 /* BLAS functions.  */
 extern GTY(()) tree gfor_fndecl_sgemm;
 extern GTY(()) tree gfor_fndecl_dgemm;
 extern GTY(()) tree gfor_fndecl_cgemm;
 extern GTY(()) tree gfor_fndecl_zgemm;
 /* String functions.  */
 extern GTY(()) tree gfor_fndecl_compare_string;
 extern GTY(()) tree gfor_fndecl_concat_string;
--- a/libgfortran/ChangeLog
+++ b/libgfortran/ChangeLog
@ -1,3 +1,19 @@
 2006-10-22  Francois-Xavier Coudert  <coudert@clipper.ens.fr>
 	PR fortran/26025
 	* m4/matmul.m4: Add possible call to gemm routine.
 	* generated/matmul_r8.c: Regenerate.
 	* generated/matmul_r16.c: Regenerate.
 	* generated/matmul_c8.c: Regenerate.
 	* generated/matmul_i8.c: Regenerate.
 	* generated/matmul_c16.c: Regenerate.
 	* generated/matmul_r10.c: Regenerate.
 	* generated/matmul_r4.c: Regenerate.
 	* generated/matmul_c10.c: Regenerate.
 	* generated/matmul_c4.c: Regenerate.
 	* generated/matmul_i4.c: Regenerate.
 	* generated/matmul_i16.c: Regenerate.
 2006-10-21  Steven G. Kargl  <kargl@gcc.gnu.org>
 	* runtime/error.c: Add errno.h
--- a/libgfortran/generated/matmul_c10.c
+++ b/libgfortran/generated/matmul_c10.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_10)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
                          const int *, const GFC_COMPLEX_10 *, const int *,
                          const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_c10 (gfc_array_c10 * const restrict retarray, 
-	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b);
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c10);
 void
 matmul_c10 (gfc_array_c10 * const restrict retarray, 
-	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b)
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_COMPLEX_10 * restrict abase;
  const GFC_COMPLEX_10 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_COMPLEX_10 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_10 * restrict bbase_y;
--- a/libgfortran/generated/matmul_c16.c
+++ b/libgfortran/generated/matmul_c16.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_16)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
                          const int *, const GFC_COMPLEX_16 *, const int *,
                          const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_c16 (gfc_array_c16 * const restrict retarray, 
-	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b);
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c16);
 void
 matmul_c16 (gfc_array_c16 * const restrict retarray, 
-	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b)
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_COMPLEX_16 * restrict abase;
  const GFC_COMPLEX_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_COMPLEX_16 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_16 * restrict bbase_y;
--- a/libgfortran/generated/matmul_c4.c
+++ b/libgfortran/generated/matmul_c4.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_4)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
                          const int *, const GFC_COMPLEX_4 *, const int *,
                          const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_c4 (gfc_array_c4 * const restrict retarray, 
-	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b);
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c4);
 void
 matmul_c4 (gfc_array_c4 * const restrict retarray, 
-	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b)
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_COMPLEX_4 * restrict abase;
  const GFC_COMPLEX_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_COMPLEX_4 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_4 * restrict bbase_y;
--- a/libgfortran/generated/matmul_c8.c
+++ b/libgfortran/generated/matmul_c8.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_8)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
                          const int *, const GFC_COMPLEX_8 *, const int *,
                          const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_c8 (gfc_array_c8 * const restrict retarray, 
-	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b);
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_c8);
 void
 matmul_c8 (gfc_array_c8 * const restrict retarray, 
-	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b)
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_COMPLEX_8 * restrict abase;
  const GFC_COMPLEX_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_COMPLEX_8 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_8 * restrict bbase_y;
--- a/libgfortran/generated/matmul_i16.c
+++ b/libgfortran/generated/matmul_i16.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_16)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
                          const int *, const GFC_INTEGER_16 *, const int *,
                          const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_i16 (gfc_array_i16 * const restrict retarray, 
-	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b);
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i16);
 void
 matmul_i16 (gfc_array_i16 * const restrict retarray, 
-	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b)
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_INTEGER_16 * restrict abase;
  const GFC_INTEGER_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_INTEGER_16 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_16 * restrict bbase_y;
--- a/libgfortran/generated/matmul_i4.c
+++ b/libgfortran/generated/matmul_i4.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_4)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
                          const int *, const GFC_INTEGER_4 *, const int *,
                          const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_i4 (gfc_array_i4 * const restrict retarray, 
-	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b);
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i4);
 void
 matmul_i4 (gfc_array_i4 * const restrict retarray, 
-	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b)
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_INTEGER_4 * restrict abase;
  const GFC_INTEGER_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_INTEGER_4 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_4 * restrict bbase_y;
--- a/libgfortran/generated/matmul_i8.c
+++ b/libgfortran/generated/matmul_i8.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_8)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
                          const int *, const GFC_INTEGER_8 *, const int *,
                          const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_i8 (gfc_array_i8 * const restrict retarray, 
-	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b);
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_i8);
 void
 matmul_i8 (gfc_array_i8 * const restrict retarray, 
-	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b)
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_INTEGER_8 * restrict abase;
  const GFC_INTEGER_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_INTEGER_8 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_8 * restrict bbase_y;
--- a/libgfortran/generated/matmul_r10.c
+++ b/libgfortran/generated/matmul_r10.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_10)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
                          const int *, const GFC_REAL_10 *, const int *,
                          const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_r10 (gfc_array_r10 * const restrict retarray, 
-	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b);
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r10);
 void
 matmul_r10 (gfc_array_r10 * const restrict retarray, 
-	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b)
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_REAL_10 * restrict abase;
  const GFC_REAL_10 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_REAL_10 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_REAL_10 * restrict bbase_y;
--- a/libgfortran/generated/matmul_r16.c
+++ b/libgfortran/generated/matmul_r16.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_16)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
                          const int *, const GFC_REAL_16 *, const int *,
                          const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_r16 (gfc_array_r16 * const restrict retarray, 
-	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b);
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r16);
 void
 matmul_r16 (gfc_array_r16 * const restrict retarray, 
-	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b)
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_REAL_16 * restrict abase;
  const GFC_REAL_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_REAL_16 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_REAL_16 * restrict bbase_y;
--- a/libgfortran/generated/matmul_r4.c
+++ b/libgfortran/generated/matmul_r4.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_4)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
                          const int *, const GFC_REAL_4 *, const int *,
                          const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_r4 (gfc_array_r4 * const restrict retarray, 
-	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b);
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r4);
 void
 matmul_r4 (gfc_array_r4 * const restrict retarray, 
-	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b)
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_REAL_4 * restrict abase;
  const GFC_REAL_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_REAL_4 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_REAL_4 * restrict bbase_y;
--- a/libgfortran/generated/matmul_r8.c
+++ b/libgfortran/generated/matmul_r8.c
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_8)
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
                          const int *, const GFC_REAL_8 *, const int *,
                          const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA.  */
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_r8 (gfc_array_r8 * const restrict retarray, 
-	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b);
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_r8);
 void
 matmul_r8 (gfc_array_r8 * const restrict retarray, 
-	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b)
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const GFC_REAL_8 * restrict abase;
  const GFC_REAL_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const GFC_REAL_8 one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const GFC_REAL_8 * restrict bbase_y;
--- a/libgfortran/m4/matmul.m4
+++ b/libgfortran/m4/matmul.m4
@ -37,6 +37,16 @@ include(iparm.m4)dnl
 `#if defined (HAVE_'rtype_name`)'
 /* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we'll call it for large
   matrices.  */
 typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const rtype_name *, const rtype_name *,
                          const int *, const rtype_name *, const int *,
                          const rtype_name *, rtype_name *, const int *,
                          int, int);
 /* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -57,18 +67,24 @@ include(iparm.m4)dnl
       DO I=1,M
         S = 0
         DO K=1,COUNT
-           S = S+A(I,K)+B(K,J)
+           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
 */
 /* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
 extern void matmul_`'rtype_code (rtype * const restrict retarray, 
-	rtype * const restrict a, rtype * const restrict b);
+	rtype * const restrict a, rtype * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm);
 export_proto(matmul_`'rtype_code);
 void
 matmul_`'rtype_code (rtype * const restrict retarray, 
-	rtype * const restrict a, rtype * const restrict b)
+	rtype * const restrict a, rtype * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm)
 {
  const rtype_name * restrict abase;
  const rtype_name * restrict bbase;
@ -179,6 +195,31 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
  bbase = b->data;
  dest = retarray->data;
  /* Now that everything is set up, we're performing the multiplication
     itself.  */
 #define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
  if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
      && (bxstride == 1 || bystride == 1)
      && (((float) xcount) * ((float) ycount) * ((float) count)
          > POW3(blas_limit)))
  {
    const int m = xcount, n = ycount, k = count, ldc = rystride;
    const rtype_name one = 1, zero = 0;
    const int lda = (axstride == 1) ? aystride : axstride,
              ldb = (bxstride == 1) ? bystride : bxstride;
    if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
      {
        assert (gemm != NULL);
        gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
              &one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
        return;
      }
  }
  if (rxstride == 1 && axstride == 1 && bxstride == 1)
    {
      const rtype_name * restrict bbase_y;