Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h.

gcc/fortran/ * Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h. * gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag. * dependency.h (gfc_get_noncopying_intrinsic_argument): Declare. (gfc_check_fncall_dependency): Change prototype. * dependency.c (gfc_get_noncopying_intrinsic_argument): New function. (gfc_check_argument_var_dependency): New function, split from gfc_check_fncall_dependency. (gfc_check_argument_dependency): New function. (gfc_check_fncall_dependency): Replace the expression parameter with separate symbol and argument list parameters. Generalize the function to handle dependencies for any type of expression, not just variables. Accept a further argument giving the intent of the expression being tested. Ignore intent(in) arguments if that expression is also intent(in). * resolve.c: Include dependency.h. (find_noncopying_intrinsics): New function. (resolve_function, resolve_call): Call it on success. * trans-array.h (gfc_conv_array_transpose): Declare. (gfc_check_fncall_dependency): Remove prototype. * trans-array.c (gfc_conv_array_transpose): New function. * trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the libcall handling if the expression is to be evaluated inline. Add a case for handling inline transpose()s. * trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new interface provided by gfc_check_fncall_dependency. libgfortran/ * m4/matmul.m4: Use a different order in the special case of a transposed first argument. * generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c, * generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c, * generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c * generated/matmul_r10.c, generated/matmul_r16.c: Regenerated. Co-Authored-By: Victor Leikehman <LEI@il.ibm.com> From-SVN: r108459
2005-12-13 05:23:12 +00:00 · 2005-12-13 05:23:12 +00:00 · 1524f80b1c
parent 264c41eda5
commit 1524f80b1c
23 changed files with 1051 additions and 146 deletions
--- a/gcc/fortran/ChangeLog
+++ b/gcc/fortran/ChangeLog
@ -1,3 +1,32 @@
 2005-12-13  Richard Sandiford  <richard@codesourcery.com>
 	* Make-lang.in (fortran/trans-resolve.o): Depend on
 	fortran/dependency.h.
 	* gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
 	* dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
 	(gfc_check_fncall_dependency): Change prototype.
 	* dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
 	(gfc_check_argument_var_dependency): New function, split from
 	gfc_check_fncall_dependency.
 	(gfc_check_argument_dependency): New function.
 	(gfc_check_fncall_dependency): Replace the expression parameter with
 	separate symbol and argument list parameters.  Generalize the function
 	to handle dependencies for any type of expression, not just variables.
 	Accept a further argument giving the intent of the expression being
 	tested.  Ignore	intent(in) arguments if that expression is also
 	intent(in).
 	* resolve.c: Include dependency.h.
 	(find_noncopying_intrinsics): New function.
 	(resolve_function, resolve_call): Call it on success.
 	* trans-array.h (gfc_conv_array_transpose): Declare.
 	(gfc_check_fncall_dependency): Remove prototype.
 	* trans-array.c (gfc_conv_array_transpose): New function.
 	* trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
 	libcall handling if the expression is to be evaluated inline.
 	Add a case for handling inline transpose()s.
 	* trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
 	interface provided by gfc_check_fncall_dependency.
 2005-12-12  Steven G. Kargl  <kargls@comcast.net>
 	PR fortran/25078
--- a/gcc/fortran/Make-lang.in
+++ b/gcc/fortran/Make-lang.in
@ -286,4 +286,5 @@ fortran/trans-intrinsic.o: $(GFORTRAN_TRANS_DEPS) fortran/mathbuiltins.def \
  gt-fortran-trans-intrinsic.h
 fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
 fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
 fortran/resolve.o: fortran/dependency.h
--- a/gcc/fortran/dependency.c
+++ b/gcc/fortran/dependency.c
@ -175,6 +175,32 @@ gfc_is_same_range (gfc_array_ref * ar1, gfc_array_ref * ar2, int n, int def)
 }
 /* Some array-returning intrinsics can be implemented by reusing the
   data from one of the array arguments.  For example, TRANPOSE does
   not necessarily need to allocate new data: it can be implemented
   by copying the original array's descriptor and simply swapping the
   two dimension specifications.
   If EXPR is a call to such an intrinsic, return the argument
   whose data can be reused, otherwise return NULL.  */
 gfc_expr *
 gfc_get_noncopying_intrinsic_argument (gfc_expr * expr)
 {
  if (expr->expr_type != EXPR_FUNCTION || !expr->value.function.isym)
    return NULL;
  switch (expr->value.function.isym->generic_id)
    {
    case GFC_ISYM_TRANSPOSE:
      return expr->value.function.actual->expr;
    default:
      return NULL;
    }
 }
 /* Return true if the result of reference REF can only be constructed
   using a temporary array.  */
@ -214,23 +240,82 @@ gfc_ref_needs_temporary_p (gfc_ref *ref)
 }
-/* Dependency checking for direct function return by reference.
+/* Return true if array variable VAR could be passed to the same function
-   Returns true if the arguments of the function depend on the
+   as argument EXPR without interfering with EXPR.  INTENT is the intent
-   destination.  This is considerably less conservative than other
+   of VAR.
-   dependencies because many function arguments will already be
+
-   copied into a temporary.  */
+   This is considerably less conservative than other dependencies
   because many function arguments will already be copied into a
   temporary.  */
 static int
 gfc_check_argument_var_dependency (gfc_expr * var, sym_intent intent,
 				   gfc_expr * expr)
 {
  gcc_assert (var->expr_type == EXPR_VARIABLE);
  gcc_assert (var->rank > 0);
  switch (expr->expr_type)
    {
    case EXPR_VARIABLE:
      return (gfc_ref_needs_temporary_p (expr->ref)
 	      || gfc_check_dependency (var, expr, NULL, 0));
    case EXPR_ARRAY:
      return gfc_check_dependency (var, expr, NULL, 0);
    case EXPR_FUNCTION:
      if (intent != INTENT_IN && expr->inline_noncopying_intrinsic)
 	{
 	  expr = gfc_get_noncopying_intrinsic_argument (expr);
 	  return gfc_check_argument_var_dependency (var, intent, expr);
 	}
      return 0;
    default:
      return 0;
    }
 }
 /* Like gfc_check_argument_var_dependency, but extended to any
   array expression OTHER, not just variables.  */
 static int
 gfc_check_argument_dependency (gfc_expr * other, sym_intent intent,
 			       gfc_expr * expr)
 {
  switch (other->expr_type)
    {
    case EXPR_VARIABLE:
      return gfc_check_argument_var_dependency (other, intent, expr);
    case EXPR_FUNCTION:
      if (other->inline_noncopying_intrinsic)
 	{
 	  other = gfc_get_noncopying_intrinsic_argument (other);
 	  return gfc_check_argument_dependency (other, INTENT_IN, expr);
 	}
      return 0;
    default:
      return 0;
    }
 }
 /* Like gfc_check_argument_dependency, but check all the arguments in ACTUAL.
   FNSYM is the function being called, or NULL if not known.  */
 int
-gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
+gfc_check_fncall_dependency (gfc_expr * other, sym_intent intent,
 			     gfc_symbol * fnsym, gfc_actual_arglist * actual)
 {
-  gfc_actual_arglist *actual;
+  gfc_formal_arglist *formal;
  gfc_expr *expr;
-  gcc_assert (dest->expr_type == EXPR_VARIABLE
+  formal = fnsym ? fnsym->formal : NULL;
-	  && fncall->expr_type == EXPR_FUNCTION);
+  for (; actual; actual = actual->next, formal = formal ? formal->next : NULL)
  gcc_assert (fncall->rank > 0);
  for (actual = fncall->value.function.actual; actual; actual = actual->next)
    {
      expr = actual->expr;
@ -238,23 +323,14 @@ gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
      if (!expr)
 	continue;
-      /* Non-variable expressions will be allocated temporaries anyway.  */
+      /* Skip intent(in) arguments if OTHER itself is intent(in).  */
-      switch (expr->expr_type)
+      if (formal
-	{
+	  && intent == INTENT_IN
-	case EXPR_VARIABLE:
+	  && formal->sym->attr.intent == INTENT_IN)
-	  if (!gfc_ref_needs_temporary_p (expr->ref)
+	continue;
 	      && gfc_check_dependency (dest, expr, NULL, 0))
 	    return 1;
 	  break;
-	case EXPR_ARRAY:
+      if (gfc_check_argument_dependency (other, intent, expr))
-	  if (gfc_check_dependency (dest, expr, NULL, 0))
+	return 1;
 	    return 1;
 	  break;
 	default:
 	  break;
 	}
    }
  return 0;
--- a/gcc/fortran/dependency.h
+++ b/gcc/fortran/dependency.h
@ -22,7 +22,9 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 bool gfc_ref_needs_temporary_p (gfc_ref *);
-int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
+gfc_expr *gfc_get_noncopying_intrinsic_argument (gfc_expr *);
 int gfc_check_fncall_dependency (gfc_expr *, sym_intent, gfc_symbol *,
 				 gfc_actual_arglist *);
 int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
 int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
 int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@ -1129,6 +1129,9 @@ typedef struct gfc_expr
  /* True if it is converted from Hollerith constant.  */
  unsigned int from_H : 1;
  /* True if the expression is a call to a function that returns an array,
     and if we have decided not to allocate temporary data for that array.  */
  unsigned int inline_noncopying_intrinsic : 1;
  union
  {
--- a/gcc/fortran/resolve.c
+++ b/gcc/fortran/resolve.c
@ -24,6 +24,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor,Boston, MA
 #include "system.h"
 #include "gfortran.h"
 #include "arith.h"  /* For gfc_compare_expr().  */
 #include "dependency.h"
 /* Types used in equivalence statements.  */
@ -804,6 +805,24 @@ resolve_actual_arglist (gfc_actual_arglist * arg)
 }
 /* Go through each actual argument in ACTUAL and see if it can be
   implemented as an inlined, non-copying intrinsic.  FNSYM is the
   function being called, or NULL if not known.  */
 static void
 find_noncopying_intrinsics (gfc_symbol * fnsym, gfc_actual_arglist * actual)
 {
  gfc_actual_arglist *ap;
  gfc_expr *expr;
  for (ap = actual; ap; ap = ap->next)
    if (ap->expr
 	&& (expr = gfc_get_noncopying_intrinsic_argument (ap->expr))
 	&& !gfc_check_fncall_dependency (expr, INTENT_IN, fnsym, actual))
      ap->expr->inline_noncopying_intrinsic = 1;
 }
 /************* Function resolution *************/
 /* Resolve a function call known to be generic.
@ -1150,6 +1169,9 @@ resolve_function (gfc_expr * expr)
 	}
    }
  if (t == SUCCESS)
    find_noncopying_intrinsics (expr->value.function.esym,
 				expr->value.function.actual);
  return t;
 }
@ -1372,27 +1394,28 @@ resolve_call (gfc_code * c)
  if (resolve_actual_arglist (c->ext.actual) == FAILURE)
    return FAILURE;
-  if (c->resolved_sym != NULL)
+  t = SUCCESS;
-    return SUCCESS;
+  if (c->resolved_sym == NULL)
    switch (procedure_kind (c->symtree->n.sym))
      {
      case PTYPE_GENERIC:
 	t = resolve_generic_s (c);
 	break;
-  switch (procedure_kind (c->symtree->n.sym))
+      case PTYPE_SPECIFIC:
-    {
+	t = resolve_specific_s (c);
-    case PTYPE_GENERIC:
+	break;
      t = resolve_generic_s (c);
      break;
-    case PTYPE_SPECIFIC:
+      case PTYPE_UNKNOWN:
-      t = resolve_specific_s (c);
+	t = resolve_unknown_s (c);
-      break;
+	break;
-    case PTYPE_UNKNOWN:
+      default:
-      t = resolve_unknown_s (c);
+	gfc_internal_error ("resolve_subroutine(): bad function type");
-      break;
+      }
    default:
      gfc_internal_error ("resolve_subroutine(): bad function type");
    }
  if (t == SUCCESS)
    find_noncopying_intrinsics (c->resolved_sym, c->ext.actual);
  return t;
 }
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@ -673,6 +673,95 @@ gfc_trans_allocate_temp_array (stmtblock_t * pre, stmtblock_t * post,
 }
 /* Generate code to tranpose array EXPR by creating a new descriptor
   in which the dimension specifications have been reversed.  */
 void
 gfc_conv_array_transpose (gfc_se * se, gfc_expr * expr)
 {
  tree dest, src, dest_index, src_index;
  gfc_loopinfo *loop;
  gfc_ss_info *dest_info, *src_info;
  gfc_ss *dest_ss, *src_ss;
  gfc_se src_se;
  int n;
  loop = se->loop;
  src_ss = gfc_walk_expr (expr);
  dest_ss = se->ss;
  src_info = &src_ss->data.info;
  dest_info = &dest_ss->data.info;
  /* Get a descriptor for EXPR.  */
  gfc_init_se (&src_se, NULL);
  gfc_conv_expr_descriptor (&src_se, expr, src_ss);
  gfc_add_block_to_block (&se->pre, &src_se.pre);
  gfc_add_block_to_block (&se->post, &src_se.post);
  src = src_se.expr;
  /* Allocate a new descriptor for the return value.  */
  dest = gfc_create_var (TREE_TYPE (src), "atmp");
  dest_info->descriptor = dest;
  se->expr = dest;
  /* Copy across the dtype field.  */
  gfc_add_modify_expr (&se->pre,
 		       gfc_conv_descriptor_dtype (dest),
 		       gfc_conv_descriptor_dtype (src));
  /* Copy the dimension information, renumbering dimension 1 to 0 and
     0 to 1.  */
  gcc_assert (dest_info->dimen == 2);
  gcc_assert (src_info->dimen == 2);
  for (n = 0; n < 2; n++)
    {
      dest_info->delta[n] = integer_zero_node;
      dest_info->start[n] = integer_zero_node;
      dest_info->stride[n] = integer_one_node;
      dest_info->dim[n] = n;
      dest_index = gfc_rank_cst[n];
      src_index = gfc_rank_cst[1 - n];
      gfc_add_modify_expr (&se->pre,
 			   gfc_conv_descriptor_stride (dest, dest_index),
 			   gfc_conv_descriptor_stride (src, src_index));
      gfc_add_modify_expr (&se->pre,
 			   gfc_conv_descriptor_lbound (dest, dest_index),
 			   gfc_conv_descriptor_lbound (src, src_index));
      gfc_add_modify_expr (&se->pre,
 			   gfc_conv_descriptor_ubound (dest, dest_index),
 			   gfc_conv_descriptor_ubound (src, src_index));
      if (!loop->to[n])
        {
 	  gcc_assert (integer_zerop (loop->from[n]));
 	  loop->to[n] = build2 (MINUS_EXPR, gfc_array_index_type,
 				gfc_conv_descriptor_ubound (dest, dest_index),
 				gfc_conv_descriptor_lbound (dest, dest_index));
        }
    }
  /* Copy the data pointer.  */
  dest_info->data = gfc_conv_descriptor_data_get (src);
  gfc_conv_descriptor_data_set (&se->pre, dest, dest_info->data);
  /* Copy the offset.  This is not changed by transposition: the top-left
     element is still at the same offset as before.  */
  dest_info->offset = gfc_conv_descriptor_offset (src);
  gfc_add_modify_expr (&se->pre,
 		       gfc_conv_descriptor_offset (dest),
 		       dest_info->offset);
  if (dest_info->dimen > loop->temp_dim)
    loop->temp_dim = dest_info->dimen;
 }
 /* Return the number of iterations in a loop that starts at START,
   ends at END, and has step STEP.  */
--- a/gcc/fortran/trans-array.h
+++ b/gcc/fortran/trans-array.h
@ -91,6 +91,8 @@ void gfc_conv_tmp_ref (gfc_se *);
 void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
 /* Convert an array for passing as an actual function parameter.  */
 void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
 /* Evaluate and transpose a matrix expression.  */
 void gfc_conv_array_transpose (gfc_se *, gfc_expr *);
 /* These work with both descriptors and descriptorless arrays.  */
 tree gfc_conv_array_data (tree);
@ -112,8 +114,6 @@ tree gfc_conv_descriptor_ubound (tree, tree);
 /* Dependency checking for WHERE and FORALL.  */
 int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
 /* Dependency checking for function calls.  */
 int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
 /* Add pre-loop scalarization code for intrinsic functions which require
   special handling.  */
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@ -2627,7 +2627,9 @@ gfc_trans_arrayfunc_assign (gfc_expr * expr1, gfc_expr * expr2)
    }
  /* Check for a dependency.  */
-  if (gfc_check_fncall_dependency (expr1, expr2))
+  if (gfc_check_fncall_dependency (expr1, INTENT_OUT,
 				   expr2->value.function.esym,
 				   expr2->value.function.actual))
    return NULL;
  /* The frontend doesn't seem to bother filling in expr->symtree for intrinsic
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@ -2894,7 +2894,7 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
  name = &expr->value.function.name[2];
-  if (expr->rank > 0)
+  if (expr->rank > 0 && !expr->inline_noncopying_intrinsic)
    {
      lib = gfc_is_intrinsic_libcall (expr);
      if (lib != 0)
@ -3119,6 +3119,16 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
      gfc_conv_intrinsic_bound (se, expr, 0);
      break;
    case GFC_ISYM_TRANSPOSE:
      if (se->ss && se->ss->useflags)
 	{
 	  gfc_conv_tmp_array_ref (se);
 	  gfc_advance_se_ss_chain (se);
 	}
      else
 	gfc_conv_array_transpose (se, expr->value.function.actual->expr);
      break;
    case GFC_ISYM_LEN:
      gfc_conv_intrinsic_len (se, expr);
      break;
--- a/libgfortran/ChangeLog
+++ b/libgfortran/ChangeLog
@ -1,3 +1,13 @@
 2005-12-13  Richard Sandiford  <richard@codesourcery.com>
 	    Victor Leikehman  <LEI@il.ibm.com>
 	* m4/matmul.m4: Use a different order in the special case of a
 	transposed first argument.
 	* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
 	* generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
 	* generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c
 	* generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.
 2005-12-10  Janne Blomqvist  <jb@gcc.gnu.org>
 	* Makefile.am: Enable loop unrolling for matmul.
--- a/libgfortran/generated/matmul_c10.c
+++ b/libgfortran/generated/matmul_c10.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_10)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_c10 (gfc_array_c10 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_10 *restrict abase_x;
      const GFC_COMPLEX_10 *restrict bbase_y;
      GFC_COMPLEX_10 *restrict dest_y;
      GFC_COMPLEX_10 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_10) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_COMPLEX_10 *restrict abase_x;
      const GFC_COMPLEX_10 *restrict bbase_y;
      GFC_COMPLEX_10 *restrict dest_y;
      GFC_COMPLEX_10 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_10) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_c16.c
+++ b/libgfortran/generated/matmul_c16.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_16)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_c16 (gfc_array_c16 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_16 *restrict abase_x;
      const GFC_COMPLEX_16 *restrict bbase_y;
      GFC_COMPLEX_16 *restrict dest_y;
      GFC_COMPLEX_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_COMPLEX_16 *restrict abase_x;
      const GFC_COMPLEX_16 *restrict bbase_y;
      GFC_COMPLEX_16 *restrict dest_y;
      GFC_COMPLEX_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_c4.c
+++ b/libgfortran/generated/matmul_c4.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_4)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_c4 (gfc_array_c4 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_4 *restrict abase_x;
      const GFC_COMPLEX_4 *restrict bbase_y;
      GFC_COMPLEX_4 *restrict dest_y;
      GFC_COMPLEX_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_COMPLEX_4 *restrict abase_x;
      const GFC_COMPLEX_4 *restrict bbase_y;
      GFC_COMPLEX_4 *restrict dest_y;
      GFC_COMPLEX_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_c8.c
+++ b/libgfortran/generated/matmul_c8.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_COMPLEX_8)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_c8 (gfc_array_c8 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_COMPLEX_8 *restrict abase_x;
      const GFC_COMPLEX_8 *restrict bbase_y;
      GFC_COMPLEX_8 *restrict dest_y;
      GFC_COMPLEX_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_COMPLEX_8 *restrict abase_x;
      const GFC_COMPLEX_8 *restrict bbase_y;
      GFC_COMPLEX_8 *restrict dest_y;
      GFC_COMPLEX_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_COMPLEX_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_i16.c
+++ b/libgfortran/generated/matmul_i16.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_16)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_i16 (gfc_array_i16 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_16 *restrict abase_x;
      const GFC_INTEGER_16 *restrict bbase_y;
      GFC_INTEGER_16 *restrict dest_y;
      GFC_INTEGER_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_INTEGER_16 *restrict abase_x;
      const GFC_INTEGER_16 *restrict bbase_y;
      GFC_INTEGER_16 *restrict dest_y;
      GFC_INTEGER_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_i4.c
+++ b/libgfortran/generated/matmul_i4.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_4)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_i4 (gfc_array_i4 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_4 *restrict abase_x;
      const GFC_INTEGER_4 *restrict bbase_y;
      GFC_INTEGER_4 *restrict dest_y;
      GFC_INTEGER_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_INTEGER_4 *restrict abase_x;
      const GFC_INTEGER_4 *restrict bbase_y;
      GFC_INTEGER_4 *restrict dest_y;
      GFC_INTEGER_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_i8.c
+++ b/libgfortran/generated/matmul_i8.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_INTEGER_8)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_i8 (gfc_array_i8 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_INTEGER_8 *restrict abase_x;
      const GFC_INTEGER_8 *restrict bbase_y;
      GFC_INTEGER_8 *restrict dest_y;
      GFC_INTEGER_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_INTEGER_8 *restrict abase_x;
      const GFC_INTEGER_8 *restrict bbase_y;
      GFC_INTEGER_8 *restrict dest_y;
      GFC_INTEGER_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_INTEGER_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_r10.c
+++ b/libgfortran/generated/matmul_r10.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_10)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_r10 (gfc_array_r10 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_REAL_10 *restrict abase_x;
      const GFC_REAL_10 *restrict bbase_y;
      GFC_REAL_10 *restrict dest_y;
      GFC_REAL_10 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_10) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_10 *restrict abase_x;
      const GFC_REAL_10 *restrict bbase_y;
      GFC_REAL_10 *restrict dest_y;
      GFC_REAL_10 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_10) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_r16.c
+++ b/libgfortran/generated/matmul_r16.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_16)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_r16 (gfc_array_r16 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_REAL_16 *restrict abase_x;
      const GFC_REAL_16 *restrict bbase_y;
      GFC_REAL_16 *restrict dest_y;
      GFC_REAL_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_16 *restrict abase_x;
      const GFC_REAL_16 *restrict bbase_y;
      GFC_REAL_16 *restrict dest_y;
      GFC_REAL_16 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_16) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_r4.c
+++ b/libgfortran/generated/matmul_r4.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_4)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_r4 (gfc_array_r4 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_4 *restrict abase_x;
      const GFC_REAL_4 *restrict bbase_y;
      GFC_REAL_4 *restrict dest_y;
      GFC_REAL_4 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_4) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/generated/matmul_r8.c
+++ b/libgfortran/generated/matmul_r8.c
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA.  */
 #if defined (HAVE_GFC_REAL_8)
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_r8 (gfc_array_r8 * const restrict retarray, 
@ -204,7 +217,28 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const GFC_REAL_8 *restrict abase_x;
      const GFC_REAL_8 *restrict bbase_y;
      GFC_REAL_8 *restrict dest_y;
      GFC_REAL_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -216,6 +250,27 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const GFC_REAL_8 *restrict abase_x;
      const GFC_REAL_8 *restrict bbase_y;
      GFC_REAL_8 *restrict dest_y;
      GFC_REAL_8 s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (GFC_REAL_8) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif
--- a/libgfortran/m4/matmul.m4
+++ b/libgfortran/m4/matmul.m4
@ -37,16 +37,29 @@ include(iparm.m4)dnl
 `#if defined (HAVE_'rtype_name`)'
-/* This is a C version of the following fortran pseudo-code. The key
+/* The order of loops is different in the case of plain matrix
-   point is the loop order -- we access all arrays column-first, which
+   multiplication C=MATMUL(A,B), and in the frequent special case where
-   improves the performance enough to boost galgel spec score by 50%.
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.
   The equivalent Fortran pseudo-code is:
   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
-   DO J=1,N
+     C = 0
-     DO K=1,COUNT
+     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+         S = 0
         DO K=1,COUNT
           S = S+A(I,K)+B(K,J)
         C(I,J) = S
   ENDIF
 */
 extern void matmul_`'rtype_code (rtype * const restrict retarray, 
@ -206,7 +219,28 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	    }
 	}
    }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
    {
      const rtype_name *restrict abase_x;
      const rtype_name *restrict bbase_y;
      rtype_name *restrict dest_y;
      rtype_name s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (rtype_name) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n] * bbase_y[n];
 	      dest_y[x] = s;
 	    }
 	}
    }
  else if (axstride < aystride)
    {
      for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@ -218,6 +252,27 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	    /* dest[x,y] += a[x,n] * b[n,y] */
 	    dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
    }
  else
    {
      const rtype_name *restrict abase_x;
      const rtype_name *restrict bbase_y;
      rtype_name *restrict dest_y;
      rtype_name s;
      for (y = 0; y < ycount; y++)
 	{
 	  bbase_y = &bbase[y*bystride];
 	  dest_y = &dest[y*rystride];
 	  for (x = 0; x < xcount; x++)
 	    {
 	      abase_x = &abase[x*axstride];
 	      s = (rtype_name) 0;
 	      for (n = 0; n < count; n++)
 		s += abase_x[n*aystride] * bbase_y[n*bxstride];
 	      dest_y[x*rxstride] = s;
 	    }
 	}
    }
 }
 #endif