mirror of git://gcc.gnu.org/git/gcc.git
Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h.
gcc/fortran/ * Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h. * gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag. * dependency.h (gfc_get_noncopying_intrinsic_argument): Declare. (gfc_check_fncall_dependency): Change prototype. * dependency.c (gfc_get_noncopying_intrinsic_argument): New function. (gfc_check_argument_var_dependency): New function, split from gfc_check_fncall_dependency. (gfc_check_argument_dependency): New function. (gfc_check_fncall_dependency): Replace the expression parameter with separate symbol and argument list parameters. Generalize the function to handle dependencies for any type of expression, not just variables. Accept a further argument giving the intent of the expression being tested. Ignore intent(in) arguments if that expression is also intent(in). * resolve.c: Include dependency.h. (find_noncopying_intrinsics): New function. (resolve_function, resolve_call): Call it on success. * trans-array.h (gfc_conv_array_transpose): Declare. (gfc_check_fncall_dependency): Remove prototype. * trans-array.c (gfc_conv_array_transpose): New function. * trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the libcall handling if the expression is to be evaluated inline. Add a case for handling inline transpose()s. * trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new interface provided by gfc_check_fncall_dependency. libgfortran/ * m4/matmul.m4: Use a different order in the special case of a transposed first argument. * generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c, * generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c, * generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c * generated/matmul_r10.c, generated/matmul_r16.c: Regenerated. Co-Authored-By: Victor Leikehman <LEI@il.ibm.com> From-SVN: r108459
This commit is contained in:
parent
264c41eda5
commit
1524f80b1c
|
|
@ -1,3 +1,32 @@
|
||||||
|
2005-12-13 Richard Sandiford <richard@codesourcery.com>
|
||||||
|
|
||||||
|
* Make-lang.in (fortran/trans-resolve.o): Depend on
|
||||||
|
fortran/dependency.h.
|
||||||
|
* gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
|
||||||
|
* dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
|
||||||
|
(gfc_check_fncall_dependency): Change prototype.
|
||||||
|
* dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
|
||||||
|
(gfc_check_argument_var_dependency): New function, split from
|
||||||
|
gfc_check_fncall_dependency.
|
||||||
|
(gfc_check_argument_dependency): New function.
|
||||||
|
(gfc_check_fncall_dependency): Replace the expression parameter with
|
||||||
|
separate symbol and argument list parameters. Generalize the function
|
||||||
|
to handle dependencies for any type of expression, not just variables.
|
||||||
|
Accept a further argument giving the intent of the expression being
|
||||||
|
tested. Ignore intent(in) arguments if that expression is also
|
||||||
|
intent(in).
|
||||||
|
* resolve.c: Include dependency.h.
|
||||||
|
(find_noncopying_intrinsics): New function.
|
||||||
|
(resolve_function, resolve_call): Call it on success.
|
||||||
|
* trans-array.h (gfc_conv_array_transpose): Declare.
|
||||||
|
(gfc_check_fncall_dependency): Remove prototype.
|
||||||
|
* trans-array.c (gfc_conv_array_transpose): New function.
|
||||||
|
* trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
|
||||||
|
libcall handling if the expression is to be evaluated inline.
|
||||||
|
Add a case for handling inline transpose()s.
|
||||||
|
* trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
|
||||||
|
interface provided by gfc_check_fncall_dependency.
|
||||||
|
|
||||||
2005-12-12 Steven G. Kargl <kargls@comcast.net>
|
2005-12-12 Steven G. Kargl <kargls@comcast.net>
|
||||||
|
|
||||||
PR fortran/25078
|
PR fortran/25078
|
||||||
|
|
|
||||||
|
|
@ -286,4 +286,5 @@ fortran/trans-intrinsic.o: $(GFORTRAN_TRANS_DEPS) fortran/mathbuiltins.def \
|
||||||
gt-fortran-trans-intrinsic.h
|
gt-fortran-trans-intrinsic.h
|
||||||
fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
|
fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
|
||||||
fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
|
fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
|
||||||
|
fortran/resolve.o: fortran/dependency.h
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -175,6 +175,32 @@ gfc_is_same_range (gfc_array_ref * ar1, gfc_array_ref * ar2, int n, int def)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Some array-returning intrinsics can be implemented by reusing the
|
||||||
|
data from one of the array arguments. For example, TRANPOSE does
|
||||||
|
not necessarily need to allocate new data: it can be implemented
|
||||||
|
by copying the original array's descriptor and simply swapping the
|
||||||
|
two dimension specifications.
|
||||||
|
|
||||||
|
If EXPR is a call to such an intrinsic, return the argument
|
||||||
|
whose data can be reused, otherwise return NULL. */
|
||||||
|
|
||||||
|
gfc_expr *
|
||||||
|
gfc_get_noncopying_intrinsic_argument (gfc_expr * expr)
|
||||||
|
{
|
||||||
|
if (expr->expr_type != EXPR_FUNCTION || !expr->value.function.isym)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
switch (expr->value.function.isym->generic_id)
|
||||||
|
{
|
||||||
|
case GFC_ISYM_TRANSPOSE:
|
||||||
|
return expr->value.function.actual->expr;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Return true if the result of reference REF can only be constructed
|
/* Return true if the result of reference REF can only be constructed
|
||||||
using a temporary array. */
|
using a temporary array. */
|
||||||
|
|
||||||
|
|
@ -214,23 +240,82 @@ gfc_ref_needs_temporary_p (gfc_ref *ref)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Dependency checking for direct function return by reference.
|
/* Return true if array variable VAR could be passed to the same function
|
||||||
Returns true if the arguments of the function depend on the
|
as argument EXPR without interfering with EXPR. INTENT is the intent
|
||||||
destination. This is considerably less conservative than other
|
of VAR.
|
||||||
dependencies because many function arguments will already be
|
|
||||||
copied into a temporary. */
|
This is considerably less conservative than other dependencies
|
||||||
|
because many function arguments will already be copied into a
|
||||||
|
temporary. */
|
||||||
|
|
||||||
|
static int
|
||||||
|
gfc_check_argument_var_dependency (gfc_expr * var, sym_intent intent,
|
||||||
|
gfc_expr * expr)
|
||||||
|
{
|
||||||
|
gcc_assert (var->expr_type == EXPR_VARIABLE);
|
||||||
|
gcc_assert (var->rank > 0);
|
||||||
|
|
||||||
|
switch (expr->expr_type)
|
||||||
|
{
|
||||||
|
case EXPR_VARIABLE:
|
||||||
|
return (gfc_ref_needs_temporary_p (expr->ref)
|
||||||
|
|| gfc_check_dependency (var, expr, NULL, 0));
|
||||||
|
|
||||||
|
case EXPR_ARRAY:
|
||||||
|
return gfc_check_dependency (var, expr, NULL, 0);
|
||||||
|
|
||||||
|
case EXPR_FUNCTION:
|
||||||
|
if (intent != INTENT_IN && expr->inline_noncopying_intrinsic)
|
||||||
|
{
|
||||||
|
expr = gfc_get_noncopying_intrinsic_argument (expr);
|
||||||
|
return gfc_check_argument_var_dependency (var, intent, expr);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Like gfc_check_argument_var_dependency, but extended to any
|
||||||
|
array expression OTHER, not just variables. */
|
||||||
|
|
||||||
|
static int
|
||||||
|
gfc_check_argument_dependency (gfc_expr * other, sym_intent intent,
|
||||||
|
gfc_expr * expr)
|
||||||
|
{
|
||||||
|
switch (other->expr_type)
|
||||||
|
{
|
||||||
|
case EXPR_VARIABLE:
|
||||||
|
return gfc_check_argument_var_dependency (other, intent, expr);
|
||||||
|
|
||||||
|
case EXPR_FUNCTION:
|
||||||
|
if (other->inline_noncopying_intrinsic)
|
||||||
|
{
|
||||||
|
other = gfc_get_noncopying_intrinsic_argument (other);
|
||||||
|
return gfc_check_argument_dependency (other, INTENT_IN, expr);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Like gfc_check_argument_dependency, but check all the arguments in ACTUAL.
|
||||||
|
FNSYM is the function being called, or NULL if not known. */
|
||||||
|
|
||||||
int
|
int
|
||||||
gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
|
gfc_check_fncall_dependency (gfc_expr * other, sym_intent intent,
|
||||||
|
gfc_symbol * fnsym, gfc_actual_arglist * actual)
|
||||||
{
|
{
|
||||||
gfc_actual_arglist *actual;
|
gfc_formal_arglist *formal;
|
||||||
gfc_expr *expr;
|
gfc_expr *expr;
|
||||||
|
|
||||||
gcc_assert (dest->expr_type == EXPR_VARIABLE
|
formal = fnsym ? fnsym->formal : NULL;
|
||||||
&& fncall->expr_type == EXPR_FUNCTION);
|
for (; actual; actual = actual->next, formal = formal ? formal->next : NULL)
|
||||||
gcc_assert (fncall->rank > 0);
|
|
||||||
|
|
||||||
for (actual = fncall->value.function.actual; actual; actual = actual->next)
|
|
||||||
{
|
{
|
||||||
expr = actual->expr;
|
expr = actual->expr;
|
||||||
|
|
||||||
|
|
@ -238,23 +323,14 @@ gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
|
||||||
if (!expr)
|
if (!expr)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* Non-variable expressions will be allocated temporaries anyway. */
|
/* Skip intent(in) arguments if OTHER itself is intent(in). */
|
||||||
switch (expr->expr_type)
|
if (formal
|
||||||
{
|
&& intent == INTENT_IN
|
||||||
case EXPR_VARIABLE:
|
&& formal->sym->attr.intent == INTENT_IN)
|
||||||
if (!gfc_ref_needs_temporary_p (expr->ref)
|
continue;
|
||||||
&& gfc_check_dependency (dest, expr, NULL, 0))
|
|
||||||
return 1;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case EXPR_ARRAY:
|
if (gfc_check_argument_dependency (other, intent, expr))
|
||||||
if (gfc_check_dependency (dest, expr, NULL, 0))
|
return 1;
|
||||||
return 1;
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,9 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||||
|
|
||||||
|
|
||||||
bool gfc_ref_needs_temporary_p (gfc_ref *);
|
bool gfc_ref_needs_temporary_p (gfc_ref *);
|
||||||
int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
|
gfc_expr *gfc_get_noncopying_intrinsic_argument (gfc_expr *);
|
||||||
|
int gfc_check_fncall_dependency (gfc_expr *, sym_intent, gfc_symbol *,
|
||||||
|
gfc_actual_arglist *);
|
||||||
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
||||||
int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
|
int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
|
||||||
int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);
|
int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);
|
||||||
|
|
|
||||||
|
|
@ -1129,6 +1129,9 @@ typedef struct gfc_expr
|
||||||
|
|
||||||
/* True if it is converted from Hollerith constant. */
|
/* True if it is converted from Hollerith constant. */
|
||||||
unsigned int from_H : 1;
|
unsigned int from_H : 1;
|
||||||
|
/* True if the expression is a call to a function that returns an array,
|
||||||
|
and if we have decided not to allocate temporary data for that array. */
|
||||||
|
unsigned int inline_noncopying_intrinsic : 1;
|
||||||
|
|
||||||
union
|
union
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor,Boston, MA
|
||||||
#include "system.h"
|
#include "system.h"
|
||||||
#include "gfortran.h"
|
#include "gfortran.h"
|
||||||
#include "arith.h" /* For gfc_compare_expr(). */
|
#include "arith.h" /* For gfc_compare_expr(). */
|
||||||
|
#include "dependency.h"
|
||||||
|
|
||||||
/* Types used in equivalence statements. */
|
/* Types used in equivalence statements. */
|
||||||
|
|
||||||
|
|
@ -804,6 +805,24 @@ resolve_actual_arglist (gfc_actual_arglist * arg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Go through each actual argument in ACTUAL and see if it can be
|
||||||
|
implemented as an inlined, non-copying intrinsic. FNSYM is the
|
||||||
|
function being called, or NULL if not known. */
|
||||||
|
|
||||||
|
static void
|
||||||
|
find_noncopying_intrinsics (gfc_symbol * fnsym, gfc_actual_arglist * actual)
|
||||||
|
{
|
||||||
|
gfc_actual_arglist *ap;
|
||||||
|
gfc_expr *expr;
|
||||||
|
|
||||||
|
for (ap = actual; ap; ap = ap->next)
|
||||||
|
if (ap->expr
|
||||||
|
&& (expr = gfc_get_noncopying_intrinsic_argument (ap->expr))
|
||||||
|
&& !gfc_check_fncall_dependency (expr, INTENT_IN, fnsym, actual))
|
||||||
|
ap->expr->inline_noncopying_intrinsic = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/************* Function resolution *************/
|
/************* Function resolution *************/
|
||||||
|
|
||||||
/* Resolve a function call known to be generic.
|
/* Resolve a function call known to be generic.
|
||||||
|
|
@ -1150,6 +1169,9 @@ resolve_function (gfc_expr * expr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (t == SUCCESS)
|
||||||
|
find_noncopying_intrinsics (expr->value.function.esym,
|
||||||
|
expr->value.function.actual);
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -1372,27 +1394,28 @@ resolve_call (gfc_code * c)
|
||||||
if (resolve_actual_arglist (c->ext.actual) == FAILURE)
|
if (resolve_actual_arglist (c->ext.actual) == FAILURE)
|
||||||
return FAILURE;
|
return FAILURE;
|
||||||
|
|
||||||
if (c->resolved_sym != NULL)
|
t = SUCCESS;
|
||||||
return SUCCESS;
|
if (c->resolved_sym == NULL)
|
||||||
|
switch (procedure_kind (c->symtree->n.sym))
|
||||||
|
{
|
||||||
|
case PTYPE_GENERIC:
|
||||||
|
t = resolve_generic_s (c);
|
||||||
|
break;
|
||||||
|
|
||||||
switch (procedure_kind (c->symtree->n.sym))
|
case PTYPE_SPECIFIC:
|
||||||
{
|
t = resolve_specific_s (c);
|
||||||
case PTYPE_GENERIC:
|
break;
|
||||||
t = resolve_generic_s (c);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case PTYPE_SPECIFIC:
|
case PTYPE_UNKNOWN:
|
||||||
t = resolve_specific_s (c);
|
t = resolve_unknown_s (c);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case PTYPE_UNKNOWN:
|
default:
|
||||||
t = resolve_unknown_s (c);
|
gfc_internal_error ("resolve_subroutine(): bad function type");
|
||||||
break;
|
}
|
||||||
|
|
||||||
default:
|
|
||||||
gfc_internal_error ("resolve_subroutine(): bad function type");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (t == SUCCESS)
|
||||||
|
find_noncopying_intrinsics (c->resolved_sym, c->ext.actual);
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -673,6 +673,95 @@ gfc_trans_allocate_temp_array (stmtblock_t * pre, stmtblock_t * post,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Generate code to tranpose array EXPR by creating a new descriptor
|
||||||
|
in which the dimension specifications have been reversed. */
|
||||||
|
|
||||||
|
void
|
||||||
|
gfc_conv_array_transpose (gfc_se * se, gfc_expr * expr)
|
||||||
|
{
|
||||||
|
tree dest, src, dest_index, src_index;
|
||||||
|
gfc_loopinfo *loop;
|
||||||
|
gfc_ss_info *dest_info, *src_info;
|
||||||
|
gfc_ss *dest_ss, *src_ss;
|
||||||
|
gfc_se src_se;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
loop = se->loop;
|
||||||
|
|
||||||
|
src_ss = gfc_walk_expr (expr);
|
||||||
|
dest_ss = se->ss;
|
||||||
|
|
||||||
|
src_info = &src_ss->data.info;
|
||||||
|
dest_info = &dest_ss->data.info;
|
||||||
|
|
||||||
|
/* Get a descriptor for EXPR. */
|
||||||
|
gfc_init_se (&src_se, NULL);
|
||||||
|
gfc_conv_expr_descriptor (&src_se, expr, src_ss);
|
||||||
|
gfc_add_block_to_block (&se->pre, &src_se.pre);
|
||||||
|
gfc_add_block_to_block (&se->post, &src_se.post);
|
||||||
|
src = src_se.expr;
|
||||||
|
|
||||||
|
/* Allocate a new descriptor for the return value. */
|
||||||
|
dest = gfc_create_var (TREE_TYPE (src), "atmp");
|
||||||
|
dest_info->descriptor = dest;
|
||||||
|
se->expr = dest;
|
||||||
|
|
||||||
|
/* Copy across the dtype field. */
|
||||||
|
gfc_add_modify_expr (&se->pre,
|
||||||
|
gfc_conv_descriptor_dtype (dest),
|
||||||
|
gfc_conv_descriptor_dtype (src));
|
||||||
|
|
||||||
|
/* Copy the dimension information, renumbering dimension 1 to 0 and
|
||||||
|
0 to 1. */
|
||||||
|
gcc_assert (dest_info->dimen == 2);
|
||||||
|
gcc_assert (src_info->dimen == 2);
|
||||||
|
for (n = 0; n < 2; n++)
|
||||||
|
{
|
||||||
|
dest_info->delta[n] = integer_zero_node;
|
||||||
|
dest_info->start[n] = integer_zero_node;
|
||||||
|
dest_info->stride[n] = integer_one_node;
|
||||||
|
dest_info->dim[n] = n;
|
||||||
|
|
||||||
|
dest_index = gfc_rank_cst[n];
|
||||||
|
src_index = gfc_rank_cst[1 - n];
|
||||||
|
|
||||||
|
gfc_add_modify_expr (&se->pre,
|
||||||
|
gfc_conv_descriptor_stride (dest, dest_index),
|
||||||
|
gfc_conv_descriptor_stride (src, src_index));
|
||||||
|
|
||||||
|
gfc_add_modify_expr (&se->pre,
|
||||||
|
gfc_conv_descriptor_lbound (dest, dest_index),
|
||||||
|
gfc_conv_descriptor_lbound (src, src_index));
|
||||||
|
|
||||||
|
gfc_add_modify_expr (&se->pre,
|
||||||
|
gfc_conv_descriptor_ubound (dest, dest_index),
|
||||||
|
gfc_conv_descriptor_ubound (src, src_index));
|
||||||
|
|
||||||
|
if (!loop->to[n])
|
||||||
|
{
|
||||||
|
gcc_assert (integer_zerop (loop->from[n]));
|
||||||
|
loop->to[n] = build2 (MINUS_EXPR, gfc_array_index_type,
|
||||||
|
gfc_conv_descriptor_ubound (dest, dest_index),
|
||||||
|
gfc_conv_descriptor_lbound (dest, dest_index));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy the data pointer. */
|
||||||
|
dest_info->data = gfc_conv_descriptor_data_get (src);
|
||||||
|
gfc_conv_descriptor_data_set (&se->pre, dest, dest_info->data);
|
||||||
|
|
||||||
|
/* Copy the offset. This is not changed by transposition: the top-left
|
||||||
|
element is still at the same offset as before. */
|
||||||
|
dest_info->offset = gfc_conv_descriptor_offset (src);
|
||||||
|
gfc_add_modify_expr (&se->pre,
|
||||||
|
gfc_conv_descriptor_offset (dest),
|
||||||
|
dest_info->offset);
|
||||||
|
|
||||||
|
if (dest_info->dimen > loop->temp_dim)
|
||||||
|
loop->temp_dim = dest_info->dimen;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Return the number of iterations in a loop that starts at START,
|
/* Return the number of iterations in a loop that starts at START,
|
||||||
ends at END, and has step STEP. */
|
ends at END, and has step STEP. */
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,8 @@ void gfc_conv_tmp_ref (gfc_se *);
|
||||||
void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
|
void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
|
||||||
/* Convert an array for passing as an actual function parameter. */
|
/* Convert an array for passing as an actual function parameter. */
|
||||||
void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
|
void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
|
||||||
|
/* Evaluate and transpose a matrix expression. */
|
||||||
|
void gfc_conv_array_transpose (gfc_se *, gfc_expr *);
|
||||||
|
|
||||||
/* These work with both descriptors and descriptorless arrays. */
|
/* These work with both descriptors and descriptorless arrays. */
|
||||||
tree gfc_conv_array_data (tree);
|
tree gfc_conv_array_data (tree);
|
||||||
|
|
@ -112,8 +114,6 @@ tree gfc_conv_descriptor_ubound (tree, tree);
|
||||||
|
|
||||||
/* Dependency checking for WHERE and FORALL. */
|
/* Dependency checking for WHERE and FORALL. */
|
||||||
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
|
||||||
/* Dependency checking for function calls. */
|
|
||||||
int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
|
|
||||||
|
|
||||||
/* Add pre-loop scalarization code for intrinsic functions which require
|
/* Add pre-loop scalarization code for intrinsic functions which require
|
||||||
special handling. */
|
special handling. */
|
||||||
|
|
|
||||||
|
|
@ -2627,7 +2627,9 @@ gfc_trans_arrayfunc_assign (gfc_expr * expr1, gfc_expr * expr2)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check for a dependency. */
|
/* Check for a dependency. */
|
||||||
if (gfc_check_fncall_dependency (expr1, expr2))
|
if (gfc_check_fncall_dependency (expr1, INTENT_OUT,
|
||||||
|
expr2->value.function.esym,
|
||||||
|
expr2->value.function.actual))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
/* The frontend doesn't seem to bother filling in expr->symtree for intrinsic
|
/* The frontend doesn't seem to bother filling in expr->symtree for intrinsic
|
||||||
|
|
|
||||||
|
|
@ -2894,7 +2894,7 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
|
||||||
|
|
||||||
name = &expr->value.function.name[2];
|
name = &expr->value.function.name[2];
|
||||||
|
|
||||||
if (expr->rank > 0)
|
if (expr->rank > 0 && !expr->inline_noncopying_intrinsic)
|
||||||
{
|
{
|
||||||
lib = gfc_is_intrinsic_libcall (expr);
|
lib = gfc_is_intrinsic_libcall (expr);
|
||||||
if (lib != 0)
|
if (lib != 0)
|
||||||
|
|
@ -3119,6 +3119,16 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
|
||||||
gfc_conv_intrinsic_bound (se, expr, 0);
|
gfc_conv_intrinsic_bound (se, expr, 0);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case GFC_ISYM_TRANSPOSE:
|
||||||
|
if (se->ss && se->ss->useflags)
|
||||||
|
{
|
||||||
|
gfc_conv_tmp_array_ref (se);
|
||||||
|
gfc_advance_se_ss_chain (se);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
gfc_conv_array_transpose (se, expr->value.function.actual->expr);
|
||||||
|
break;
|
||||||
|
|
||||||
case GFC_ISYM_LEN:
|
case GFC_ISYM_LEN:
|
||||||
gfc_conv_intrinsic_len (se, expr);
|
gfc_conv_intrinsic_len (se, expr);
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,13 @@
|
||||||
|
2005-12-13 Richard Sandiford <richard@codesourcery.com>
|
||||||
|
Victor Leikehman <LEI@il.ibm.com>
|
||||||
|
|
||||||
|
* m4/matmul.m4: Use a different order in the special case of a
|
||||||
|
transposed first argument.
|
||||||
|
* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
|
||||||
|
* generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
|
||||||
|
* generated/matmul_i10.c, generated/matmul_r4.c, generated/matmul_r8.c
|
||||||
|
* generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.
|
||||||
|
|
||||||
2005-12-10 Janne Blomqvist <jb@gcc.gnu.org>
|
2005-12-10 Janne Blomqvist <jb@gcc.gnu.org>
|
||||||
|
|
||||||
* Makefile.am: Enable loop unrolling for matmul.
|
* Makefile.am: Enable loop unrolling for matmul.
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_10)
|
#if defined (HAVE_GFC_COMPLEX_10)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_10 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_10 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_10 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_10 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_10) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_10 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_10 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_10 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_10 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_10) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_16)
|
#if defined (HAVE_GFC_COMPLEX_16)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_16 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_16 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_16 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_16 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_16 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_16 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_4)
|
#if defined (HAVE_GFC_COMPLEX_4)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_4 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_4 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_4 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_4 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_4 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_4 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_8)
|
#if defined (HAVE_GFC_COMPLEX_8)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_8 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_8 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_8 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_COMPLEX_8 *restrict abase_x;
|
||||||
|
const GFC_COMPLEX_8 *restrict bbase_y;
|
||||||
|
GFC_COMPLEX_8 *restrict dest_y;
|
||||||
|
GFC_COMPLEX_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_COMPLEX_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_16)
|
#if defined (HAVE_GFC_INTEGER_16)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_16 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_16 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_16 *restrict dest_y;
|
||||||
|
GFC_INTEGER_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_16 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_16 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_16 *restrict dest_y;
|
||||||
|
GFC_INTEGER_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_4)
|
#if defined (HAVE_GFC_INTEGER_4)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_4 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_4 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_4 *restrict dest_y;
|
||||||
|
GFC_INTEGER_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_4 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_4 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_4 *restrict dest_y;
|
||||||
|
GFC_INTEGER_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_8)
|
#if defined (HAVE_GFC_INTEGER_8)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_8 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_8 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_8 *restrict dest_y;
|
||||||
|
GFC_INTEGER_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_INTEGER_8 *restrict abase_x;
|
||||||
|
const GFC_INTEGER_8 *restrict bbase_y;
|
||||||
|
GFC_INTEGER_8 *restrict dest_y;
|
||||||
|
GFC_INTEGER_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_INTEGER_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_10)
|
#if defined (HAVE_GFC_REAL_10)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_REAL_10 *restrict abase_x;
|
||||||
|
const GFC_REAL_10 *restrict bbase_y;
|
||||||
|
GFC_REAL_10 *restrict dest_y;
|
||||||
|
GFC_REAL_10 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_10) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_REAL_10 *restrict abase_x;
|
||||||
|
const GFC_REAL_10 *restrict bbase_y;
|
||||||
|
GFC_REAL_10 *restrict dest_y;
|
||||||
|
GFC_REAL_10 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_10) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_16)
|
#if defined (HAVE_GFC_REAL_16)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_REAL_16 *restrict abase_x;
|
||||||
|
const GFC_REAL_16 *restrict bbase_y;
|
||||||
|
GFC_REAL_16 *restrict dest_y;
|
||||||
|
GFC_REAL_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_REAL_16 *restrict abase_x;
|
||||||
|
const GFC_REAL_16 *restrict bbase_y;
|
||||||
|
GFC_REAL_16 *restrict dest_y;
|
||||||
|
GFC_REAL_16 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_16) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_4)
|
#if defined (HAVE_GFC_REAL_4)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_REAL_4 *restrict abase_x;
|
||||||
|
const GFC_REAL_4 *restrict bbase_y;
|
||||||
|
GFC_REAL_4 *restrict dest_y;
|
||||||
|
GFC_REAL_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_REAL_4 *restrict abase_x;
|
||||||
|
const GFC_REAL_4 *restrict bbase_y;
|
||||||
|
GFC_REAL_4 *restrict dest_y;
|
||||||
|
GFC_REAL_4 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_4) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -36,16 +36,29 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_8)
|
#if defined (HAVE_GFC_REAL_8)
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
|
|
@ -204,7 +217,28 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const GFC_REAL_8 *restrict abase_x;
|
||||||
|
const GFC_REAL_8 *restrict bbase_y;
|
||||||
|
GFC_REAL_8 *restrict dest_y;
|
||||||
|
GFC_REAL_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -216,6 +250,27 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const GFC_REAL_8 *restrict abase_x;
|
||||||
|
const GFC_REAL_8 *restrict bbase_y;
|
||||||
|
GFC_REAL_8 *restrict dest_y;
|
||||||
|
GFC_REAL_8 s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (GFC_REAL_8) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
|
|
@ -37,16 +37,29 @@ include(iparm.m4)dnl
|
||||||
|
|
||||||
`#if defined (HAVE_'rtype_name`)'
|
`#if defined (HAVE_'rtype_name`)'
|
||||||
|
|
||||||
/* This is a C version of the following fortran pseudo-code. The key
|
/* The order of loops is different in the case of plain matrix
|
||||||
point is the loop order -- we access all arrays column-first, which
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
improves the performance enough to boost galgel spec score by 50%.
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
C=MATMUL(TRANSPOSE(A),B). Transposed temporaries are detected by
|
||||||
|
looking at their strides.
|
||||||
|
|
||||||
|
The equivalent Fortran pseudo-code is:
|
||||||
|
|
||||||
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
|
||||||
C = 0
|
IF (.NOT.IS_TRANSPOSED(A)) THEN
|
||||||
DO J=1,N
|
C = 0
|
||||||
DO K=1,COUNT
|
DO J=1,N
|
||||||
|
DO K=1,COUNT
|
||||||
|
DO I=1,M
|
||||||
|
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
||||||
|
ELSE
|
||||||
|
DO J=1,N
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
C(I,J) = C(I,J)+A(I,K)*B(K,J)
|
S = 0
|
||||||
|
DO K=1,COUNT
|
||||||
|
S = S+A(I,K)+B(K,J)
|
||||||
|
C(I,J) = S
|
||||||
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern void matmul_`'rtype_code (rtype * const restrict retarray,
|
extern void matmul_`'rtype_code (rtype * const restrict retarray,
|
||||||
|
|
@ -206,7 +219,28 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
|
||||||
|
{
|
||||||
|
const rtype_name *restrict abase_x;
|
||||||
|
const rtype_name *restrict bbase_y;
|
||||||
|
rtype_name *restrict dest_y;
|
||||||
|
rtype_name s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (rtype_name) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n] * bbase_y[n];
|
||||||
|
dest_y[x] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (axstride < aystride)
|
||||||
{
|
{
|
||||||
for (y = 0; y < ycount; y++)
|
for (y = 0; y < ycount; y++)
|
||||||
for (x = 0; x < xcount; x++)
|
for (x = 0; x < xcount; x++)
|
||||||
|
|
@ -218,6 +252,27 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||||
/* dest[x,y] += a[x,n] * b[n,y] */
|
/* dest[x,y] += a[x,n] * b[n,y] */
|
||||||
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
const rtype_name *restrict abase_x;
|
||||||
|
const rtype_name *restrict bbase_y;
|
||||||
|
rtype_name *restrict dest_y;
|
||||||
|
rtype_name s;
|
||||||
|
|
||||||
|
for (y = 0; y < ycount; y++)
|
||||||
|
{
|
||||||
|
bbase_y = &bbase[y*bystride];
|
||||||
|
dest_y = &dest[y*rystride];
|
||||||
|
for (x = 0; x < xcount; x++)
|
||||||
|
{
|
||||||
|
abase_x = &abase[x*axstride];
|
||||||
|
s = (rtype_name) 0;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
s += abase_x[n*aystride] * bbase_y[n*bxstride];
|
||||||
|
dest_y[x*rxstride] = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue