mirror of git://gcc.gnu.org/git/gcc.git
re PR fortran/26025 (Optionally use BLAS for matmul)
PR fortran/26025 * lang.opt: Add -fexternal-blas and -fblas-matmul-limit options. * options.c (gfc_init_options): Initialize new flags. (gfc_handle_option): Handle new flags. * gfortran.h (gfc_option): Add flag_external_blas and blas_matmul_limit flags. * trans-expr.c (gfc_conv_function_call): Use new argument append_args, appending it at the end of the argument list built for a function call. * trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new append_args argument to gfc_trans_call. * trans.h (gfc_conv_function_call): Update prototype. * trans-decl.c (gfc_build_intrinsic_function_decls): Add prototypes for BLAS ?gemm routines. * trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the extra arguments given to the library matmul function, and give them to gfc_conv_function_call. * invoke.texi: Add documentation for -fexternal-blas and -fblas-matmul-limit. * m4/matmul.m4: Add possible call to gemm routine. * generated/matmul_r8.c: Regenerate. * generated/matmul_r16.c: Regenerate. * generated/matmul_c8.c: Regenerate. * generated/matmul_i8.c: Regenerate. * generated/matmul_c16.c: Regenerate. * generated/matmul_r10.c: Regenerate. * generated/matmul_r4.c: Regenerate. * generated/matmul_c10.c: Regenerate. * generated/matmul_c4.c: Regenerate. * generated/matmul_i4.c: Regenerate. * generated/matmul_i16.c: Regenerate. From-SVN: r117948
This commit is contained in:
parent
d4f6a8099d
commit
5a0aad3165
|
|
@ -1,3 +1,25 @@
|
||||||
|
2006-10-22 Francois-Xavier Coudert <coudert@clipper.ens.fr>
|
||||||
|
|
||||||
|
PR fortran/26025
|
||||||
|
* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
|
||||||
|
* options.c (gfc_init_options): Initialize new flags.
|
||||||
|
(gfc_handle_option): Handle new flags.
|
||||||
|
* gfortran.h (gfc_option): Add flag_external_blas and
|
||||||
|
blas_matmul_limit flags.
|
||||||
|
* trans-expr.c (gfc_conv_function_call): Use new argument
|
||||||
|
append_args, appending it at the end of the argument list
|
||||||
|
built for a function call.
|
||||||
|
* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
|
||||||
|
append_args argument to gfc_trans_call.
|
||||||
|
* trans.h (gfc_conv_function_call): Update prototype.
|
||||||
|
* trans-decl.c (gfc_build_intrinsic_function_decls): Add
|
||||||
|
prototypes for BLAS ?gemm routines.
|
||||||
|
* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
|
||||||
|
extra arguments given to the library matmul function, and give
|
||||||
|
them to gfc_conv_function_call.
|
||||||
|
* invoke.texi: Add documentation for -fexternal-blas and
|
||||||
|
-fblas-matmul-limit.
|
||||||
|
|
||||||
2006-10-21 Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
|
2006-10-21 Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
|
||||||
|
|
||||||
* Make-lang.in (F95_LIBS): Delete.
|
* Make-lang.in (F95_LIBS): Delete.
|
||||||
|
|
|
||||||
|
|
@ -1652,6 +1652,8 @@ typedef struct
|
||||||
int flag_f2c;
|
int flag_f2c;
|
||||||
int flag_automatic;
|
int flag_automatic;
|
||||||
int flag_backslash;
|
int flag_backslash;
|
||||||
|
int flag_external_blas;
|
||||||
|
int blas_matmul_limit;
|
||||||
int flag_cray_pointer;
|
int flag_cray_pointer;
|
||||||
int flag_d_lines;
|
int flag_d_lines;
|
||||||
int flag_openmp;
|
int flag_openmp;
|
||||||
|
|
|
||||||
|
|
@ -152,7 +152,8 @@ by type. Explanations are in the following sections.
|
||||||
@gccoptlist{
|
@gccoptlist{
|
||||||
-fno-automatic -ff2c -fno-underscoring -fsecond-underscore @gol
|
-fno-automatic -ff2c -fno-underscoring -fsecond-underscore @gol
|
||||||
-fbounds-check -fmax-stack-var-size=@var{n} @gol
|
-fbounds-check -fmax-stack-var-size=@var{n} @gol
|
||||||
-fpack-derived -frepack-arrays -fshort-enums}
|
-fpack-derived -frepack-arrays -fshort-enums -fexternal-blas
|
||||||
|
-fblas-matmul-limit=@var{n}}
|
||||||
@end table
|
@end table
|
||||||
|
|
||||||
@menu
|
@menu
|
||||||
|
|
@ -859,6 +860,27 @@ This option is provided for interoperability with C code that was
|
||||||
compiled with the @command{-fshort-enums} option. It will make
|
compiled with the @command{-fshort-enums} option. It will make
|
||||||
GNU Fortran choose the smallest @code{INTEGER} kind a given
|
GNU Fortran choose the smallest @code{INTEGER} kind a given
|
||||||
enumerator set will fit in, and give all its enumerators this kind.
|
enumerator set will fit in, and give all its enumerators this kind.
|
||||||
|
|
||||||
|
@cindex -fexternal-blas
|
||||||
|
@item -fexternal-blas
|
||||||
|
This option will make gfortran generate calls to BLAS functions for some
|
||||||
|
matrix operations like @code{MATMUL}, instead of using our own
|
||||||
|
algorithms, if the size of the matrices involved is larger than a given
|
||||||
|
limit (see @command{-fblas-matmul-limit}). This may be profitable if an
|
||||||
|
optimized vendor BLAS library is available. The BLAS library will have
|
||||||
|
to be specified at link time.
|
||||||
|
|
||||||
|
@cindex -fblas-matmul-limit
|
||||||
|
@item -fblas-matmul-limit=@var{n}
|
||||||
|
Only significant when @command{-fexternal-blas} is in effect.
|
||||||
|
Matrix multiplication of matrices with size larger than (or equal to) @var{n}
|
||||||
|
will be performed by calls to BLAS functions, while others will be
|
||||||
|
handled by @command{gfortran} internal algorithms. If the matrices
|
||||||
|
involved are not square, the size comparison is performed using the
|
||||||
|
geometric mean of the dimensions of the argument and result matrices.
|
||||||
|
|
||||||
|
The default value for @var{n} is 30.
|
||||||
|
|
||||||
@end table
|
@end table
|
||||||
|
|
||||||
@xref{Code Gen Options,,Options for Code Generation Conventions,
|
@xref{Code Gen Options,,Options for Code Generation Conventions,
|
||||||
|
|
|
||||||
|
|
@ -85,6 +85,14 @@ fbackslash
|
||||||
Fortran
|
Fortran
|
||||||
Specify that backslash in string introduces an escape character
|
Specify that backslash in string introduces an escape character
|
||||||
|
|
||||||
|
fexternal-blas
|
||||||
|
Fortran
|
||||||
|
Specify that an external BLAS library should be used for matmul calls on large-size arrays
|
||||||
|
|
||||||
|
fblas-matmul-limit=
|
||||||
|
Fortran RejectNegative Joined UInteger
|
||||||
|
-fblas-matmul-limit=<n> Size of the smallest matrix for which matmul will use BLAS
|
||||||
|
|
||||||
fdefault-double-8
|
fdefault-double-8
|
||||||
Fortran
|
Fortran
|
||||||
Set the default double precision kind to an 8 byte wide type
|
Set the default double precision kind to an 8 byte wide type
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,8 @@ gfc_init_options (unsigned int argc ATTRIBUTE_UNUSED,
|
||||||
gfc_option.flag_preprocessed = 0;
|
gfc_option.flag_preprocessed = 0;
|
||||||
gfc_option.flag_automatic = 1;
|
gfc_option.flag_automatic = 1;
|
||||||
gfc_option.flag_backslash = 1;
|
gfc_option.flag_backslash = 1;
|
||||||
|
gfc_option.flag_external_blas = 0;
|
||||||
|
gfc_option.blas_matmul_limit = 30;
|
||||||
gfc_option.flag_cray_pointer = 0;
|
gfc_option.flag_cray_pointer = 0;
|
||||||
gfc_option.flag_d_lines = -1;
|
gfc_option.flag_d_lines = -1;
|
||||||
gfc_option.flag_openmp = 0;
|
gfc_option.flag_openmp = 0;
|
||||||
|
|
@ -450,6 +452,14 @@ gfc_handle_option (size_t scode, const char *arg, int value)
|
||||||
gfc_option.flag_dollar_ok = value;
|
gfc_option.flag_dollar_ok = value;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case OPT_fexternal_blas:
|
||||||
|
gfc_option.flag_external_blas = value;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case OPT_fblas_matmul_limit_:
|
||||||
|
gfc_option.blas_matmul_limit = value;
|
||||||
|
break;
|
||||||
|
|
||||||
case OPT_fd_lines_as_code:
|
case OPT_fd_lines_as_code:
|
||||||
gfc_option.flag_d_lines = 1;
|
gfc_option.flag_d_lines = 1;
|
||||||
break;
|
break;
|
||||||
|
|
|
||||||
|
|
@ -143,6 +143,12 @@ tree gfor_fndecl_iargc;
|
||||||
tree gfor_fndecl_si_kind;
|
tree gfor_fndecl_si_kind;
|
||||||
tree gfor_fndecl_sr_kind;
|
tree gfor_fndecl_sr_kind;
|
||||||
|
|
||||||
|
/* BLAS gemm functions. */
|
||||||
|
tree gfor_fndecl_sgemm;
|
||||||
|
tree gfor_fndecl_dgemm;
|
||||||
|
tree gfor_fndecl_cgemm;
|
||||||
|
tree gfor_fndecl_zgemm;
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
gfc_add_decl_to_parent_function (tree decl)
|
gfc_add_decl_to_parent_function (tree decl)
|
||||||
|
|
@ -2186,6 +2192,49 @@ gfc_build_intrinsic_function_decls (void)
|
||||||
gfc_int4_type_node, 1,
|
gfc_int4_type_node, 1,
|
||||||
gfc_real16_type_node);
|
gfc_real16_type_node);
|
||||||
|
|
||||||
|
/* BLAS functions. */
|
||||||
|
{
|
||||||
|
tree pint = build_pointer_type (gfc_c_int_type_node);
|
||||||
|
tree ps = build_pointer_type (gfc_get_real_type (gfc_default_real_kind));
|
||||||
|
tree pd = build_pointer_type (gfc_get_real_type (gfc_default_double_kind));
|
||||||
|
tree pc = build_pointer_type (gfc_get_complex_type (gfc_default_real_kind));
|
||||||
|
tree pz = build_pointer_type
|
||||||
|
(gfc_get_complex_type (gfc_default_double_kind));
|
||||||
|
|
||||||
|
gfor_fndecl_sgemm = gfc_build_library_function_decl
|
||||||
|
(get_identifier
|
||||||
|
(gfc_option.flag_underscoring ? "sgemm_"
|
||||||
|
: "sgemm"),
|
||||||
|
void_type_node, 15, pchar_type_node,
|
||||||
|
pchar_type_node, pint, pint, pint, ps, ps, pint,
|
||||||
|
ps, pint, ps, ps, pint, gfc_c_int_type_node,
|
||||||
|
gfc_c_int_type_node);
|
||||||
|
gfor_fndecl_dgemm = gfc_build_library_function_decl
|
||||||
|
(get_identifier
|
||||||
|
(gfc_option.flag_underscoring ? "dgemm_"
|
||||||
|
: "dgemm"),
|
||||||
|
void_type_node, 15, pchar_type_node,
|
||||||
|
pchar_type_node, pint, pint, pint, pd, pd, pint,
|
||||||
|
pd, pint, pd, pd, pint, gfc_c_int_type_node,
|
||||||
|
gfc_c_int_type_node);
|
||||||
|
gfor_fndecl_cgemm = gfc_build_library_function_decl
|
||||||
|
(get_identifier
|
||||||
|
(gfc_option.flag_underscoring ? "cgemm_"
|
||||||
|
: "cgemm"),
|
||||||
|
void_type_node, 15, pchar_type_node,
|
||||||
|
pchar_type_node, pint, pint, pint, pc, pc, pint,
|
||||||
|
pc, pint, pc, pc, pint, gfc_c_int_type_node,
|
||||||
|
gfc_c_int_type_node);
|
||||||
|
gfor_fndecl_zgemm = gfc_build_library_function_decl
|
||||||
|
(get_identifier
|
||||||
|
(gfc_option.flag_underscoring ? "zgemm_"
|
||||||
|
: "zgemm"),
|
||||||
|
void_type_node, 15, pchar_type_node,
|
||||||
|
pchar_type_node, pint, pint, pint, pz, pz, pint,
|
||||||
|
pz, pint, pz, pz, pint, gfc_c_int_type_node,
|
||||||
|
gfc_c_int_type_node);
|
||||||
|
}
|
||||||
|
|
||||||
/* Other functions. */
|
/* Other functions. */
|
||||||
gfor_fndecl_size0 =
|
gfor_fndecl_size0 =
|
||||||
gfc_build_library_function_decl (get_identifier (PREFIX("size0")),
|
gfc_build_library_function_decl (get_identifier (PREFIX("size0")),
|
||||||
|
|
|
||||||
|
|
@ -1853,7 +1853,7 @@ is_aliased_array (gfc_expr * e)
|
||||||
|
|
||||||
int
|
int
|
||||||
gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
|
gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
|
||||||
gfc_actual_arglist * arg)
|
gfc_actual_arglist * arg, tree append_args)
|
||||||
{
|
{
|
||||||
gfc_interface_mapping mapping;
|
gfc_interface_mapping mapping;
|
||||||
tree arglist;
|
tree arglist;
|
||||||
|
|
@ -2226,6 +2226,11 @@ gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
|
||||||
/* Add the hidden string length parameters to the arguments. */
|
/* Add the hidden string length parameters to the arguments. */
|
||||||
arglist = chainon (arglist, stringargs);
|
arglist = chainon (arglist, stringargs);
|
||||||
|
|
||||||
|
/* We may want to append extra arguments here. This is used e.g. for
|
||||||
|
calls to libgfortran_matmul_??, which need extra information. */
|
||||||
|
if (append_args != NULL_TREE)
|
||||||
|
arglist = chainon (arglist, append_args);
|
||||||
|
|
||||||
/* Generate the actual call. */
|
/* Generate the actual call. */
|
||||||
gfc_conv_function_val (se, sym);
|
gfc_conv_function_val (se, sym);
|
||||||
/* If there are alternate return labels, function type should be
|
/* If there are alternate return labels, function type should be
|
||||||
|
|
@ -2545,7 +2550,7 @@ gfc_conv_function_expr (gfc_se * se, gfc_expr * expr)
|
||||||
sym = expr->value.function.esym;
|
sym = expr->value.function.esym;
|
||||||
if (!sym)
|
if (!sym)
|
||||||
sym = expr->symtree->n.sym;
|
sym = expr->symtree->n.sym;
|
||||||
gfc_conv_function_call (se, sym, expr->value.function.actual);
|
gfc_conv_function_call (se, sym, expr->value.function.actual, NULL_TREE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1378,6 +1378,7 @@ static void
|
||||||
gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
|
gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
|
||||||
{
|
{
|
||||||
gfc_symbol *sym;
|
gfc_symbol *sym;
|
||||||
|
tree append_args;
|
||||||
|
|
||||||
gcc_assert (!se->ss || se->ss->expr == expr);
|
gcc_assert (!se->ss || se->ss->expr == expr);
|
||||||
|
|
||||||
|
|
@ -1387,7 +1388,54 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
|
||||||
gcc_assert (expr->rank == 0);
|
gcc_assert (expr->rank == 0);
|
||||||
|
|
||||||
sym = gfc_get_symbol_for_expr (expr);
|
sym = gfc_get_symbol_for_expr (expr);
|
||||||
gfc_conv_function_call (se, sym, expr->value.function.actual);
|
|
||||||
|
/* Calls to libgfortran_matmul need to be appended special arguments,
|
||||||
|
to be able to call the BLAS ?gemm functions if required and possible. */
|
||||||
|
append_args = NULL_TREE;
|
||||||
|
if (expr->value.function.isym->generic_id == GFC_ISYM_MATMUL
|
||||||
|
&& sym->ts.type != BT_LOGICAL)
|
||||||
|
{
|
||||||
|
tree cint = gfc_get_int_type (gfc_c_int_kind);
|
||||||
|
|
||||||
|
if (gfc_option.flag_external_blas
|
||||||
|
&& (sym->ts.type == BT_REAL || sym->ts.type == BT_COMPLEX)
|
||||||
|
&& (sym->ts.kind == gfc_default_real_kind
|
||||||
|
|| sym->ts.kind == gfc_default_double_kind))
|
||||||
|
{
|
||||||
|
tree gemm_fndecl;
|
||||||
|
|
||||||
|
if (sym->ts.type == BT_REAL)
|
||||||
|
{
|
||||||
|
if (sym->ts.kind == gfc_default_real_kind)
|
||||||
|
gemm_fndecl = gfor_fndecl_sgemm;
|
||||||
|
else
|
||||||
|
gemm_fndecl = gfor_fndecl_dgemm;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (sym->ts.kind == gfc_default_real_kind)
|
||||||
|
gemm_fndecl = gfor_fndecl_cgemm;
|
||||||
|
else
|
||||||
|
gemm_fndecl = gfor_fndecl_zgemm;
|
||||||
|
}
|
||||||
|
|
||||||
|
append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 1));
|
||||||
|
append_args = gfc_chainon_list
|
||||||
|
(append_args, build_int_cst
|
||||||
|
(cint, gfc_option.blas_matmul_limit));
|
||||||
|
append_args = gfc_chainon_list (append_args,
|
||||||
|
gfc_build_addr_expr (NULL_TREE,
|
||||||
|
gemm_fndecl));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 0));
|
||||||
|
append_args = gfc_chainon_list (append_args, build_int_cst (cint, 0));
|
||||||
|
append_args = gfc_chainon_list (append_args, null_pointer_node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
gfc_conv_function_call (se, sym, expr->value.function.actual, append_args);
|
||||||
gfc_free (sym);
|
gfc_free (sym);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -334,7 +334,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
|
||||||
|
|
||||||
/* Translate the call. */
|
/* Translate the call. */
|
||||||
has_alternate_specifier
|
has_alternate_specifier
|
||||||
= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual);
|
= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual,
|
||||||
|
NULL_TREE);
|
||||||
|
|
||||||
/* A subroutine without side-effect, by definition, does nothing! */
|
/* A subroutine without side-effect, by definition, does nothing! */
|
||||||
TREE_SIDE_EFFECTS (se.expr) = 1;
|
TREE_SIDE_EFFECTS (se.expr) = 1;
|
||||||
|
|
@ -399,7 +400,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
|
||||||
gfc_init_block (&block);
|
gfc_init_block (&block);
|
||||||
|
|
||||||
/* Add the subroutine call to the block. */
|
/* Add the subroutine call to the block. */
|
||||||
gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual);
|
gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual,
|
||||||
|
NULL_TREE);
|
||||||
gfc_add_expr_to_block (&loopse.pre, loopse.expr);
|
gfc_add_expr_to_block (&loopse.pre, loopse.expr);
|
||||||
|
|
||||||
gfc_add_block_to_block (&block, &loopse.pre);
|
gfc_add_block_to_block (&block, &loopse.pre);
|
||||||
|
|
|
||||||
|
|
@ -303,7 +303,8 @@ void gfc_conv_intrinsic_function (gfc_se *, gfc_expr *);
|
||||||
int gfc_is_intrinsic_libcall (gfc_expr *);
|
int gfc_is_intrinsic_libcall (gfc_expr *);
|
||||||
|
|
||||||
/* Also used to CALL subroutines. */
|
/* Also used to CALL subroutines. */
|
||||||
int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *);
|
int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *,
|
||||||
|
tree);
|
||||||
/* gfc_trans_* shouldn't call push/poplevel, use gfc_push/pop_scope */
|
/* gfc_trans_* shouldn't call push/poplevel, use gfc_push/pop_scope */
|
||||||
|
|
||||||
/* Generate code for a scalar assignment. */
|
/* Generate code for a scalar assignment. */
|
||||||
|
|
@ -507,6 +508,12 @@ extern GTY(()) tree gfor_fndecl_math_exponent8;
|
||||||
extern GTY(()) tree gfor_fndecl_math_exponent10;
|
extern GTY(()) tree gfor_fndecl_math_exponent10;
|
||||||
extern GTY(()) tree gfor_fndecl_math_exponent16;
|
extern GTY(()) tree gfor_fndecl_math_exponent16;
|
||||||
|
|
||||||
|
/* BLAS functions. */
|
||||||
|
extern GTY(()) tree gfor_fndecl_sgemm;
|
||||||
|
extern GTY(()) tree gfor_fndecl_dgemm;
|
||||||
|
extern GTY(()) tree gfor_fndecl_cgemm;
|
||||||
|
extern GTY(()) tree gfor_fndecl_zgemm;
|
||||||
|
|
||||||
/* String functions. */
|
/* String functions. */
|
||||||
extern GTY(()) tree gfor_fndecl_compare_string;
|
extern GTY(()) tree gfor_fndecl_compare_string;
|
||||||
extern GTY(()) tree gfor_fndecl_concat_string;
|
extern GTY(()) tree gfor_fndecl_concat_string;
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,19 @@
|
||||||
|
2006-10-22 Francois-Xavier Coudert <coudert@clipper.ens.fr>
|
||||||
|
|
||||||
|
PR fortran/26025
|
||||||
|
* m4/matmul.m4: Add possible call to gemm routine.
|
||||||
|
* generated/matmul_r8.c: Regenerate.
|
||||||
|
* generated/matmul_r16.c: Regenerate.
|
||||||
|
* generated/matmul_c8.c: Regenerate.
|
||||||
|
* generated/matmul_i8.c: Regenerate.
|
||||||
|
* generated/matmul_c16.c: Regenerate.
|
||||||
|
* generated/matmul_r10.c: Regenerate.
|
||||||
|
* generated/matmul_r4.c: Regenerate.
|
||||||
|
* generated/matmul_c10.c: Regenerate.
|
||||||
|
* generated/matmul_c4.c: Regenerate.
|
||||||
|
* generated/matmul_i4.c: Regenerate.
|
||||||
|
* generated/matmul_i16.c: Regenerate.
|
||||||
|
|
||||||
2006-10-21 Steven G. Kargl <kargl@gcc.gnu.org>
|
2006-10-21 Steven G. Kargl <kargl@gcc.gnu.org>
|
||||||
|
|
||||||
* runtime/error.c: Add errno.h
|
* runtime/error.c: Add errno.h
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_10)
|
#if defined (HAVE_GFC_COMPLEX_10)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
|
||||||
|
const int *, const GFC_COMPLEX_10 *, const int *,
|
||||||
|
const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b);
|
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_c10);
|
export_proto(matmul_c10);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_c10 (gfc_array_c10 * const restrict retarray,
|
matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b)
|
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_10 * restrict abase;
|
const GFC_COMPLEX_10 * restrict abase;
|
||||||
const GFC_COMPLEX_10 * restrict bbase;
|
const GFC_COMPLEX_10 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_COMPLEX_10 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_10 * restrict bbase_y;
|
const GFC_COMPLEX_10 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_16)
|
#if defined (HAVE_GFC_COMPLEX_16)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
|
||||||
|
const int *, const GFC_COMPLEX_16 *, const int *,
|
||||||
|
const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b);
|
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_c16);
|
export_proto(matmul_c16);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_c16 (gfc_array_c16 * const restrict retarray,
|
matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b)
|
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_16 * restrict abase;
|
const GFC_COMPLEX_16 * restrict abase;
|
||||||
const GFC_COMPLEX_16 * restrict bbase;
|
const GFC_COMPLEX_16 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_COMPLEX_16 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_16 * restrict bbase_y;
|
const GFC_COMPLEX_16 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_4)
|
#if defined (HAVE_GFC_COMPLEX_4)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
|
||||||
|
const int *, const GFC_COMPLEX_4 *, const int *,
|
||||||
|
const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b);
|
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_c4);
|
export_proto(matmul_c4);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_c4 (gfc_array_c4 * const restrict retarray,
|
matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b)
|
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_4 * restrict abase;
|
const GFC_COMPLEX_4 * restrict abase;
|
||||||
const GFC_COMPLEX_4 * restrict bbase;
|
const GFC_COMPLEX_4 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_COMPLEX_4 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_4 * restrict bbase_y;
|
const GFC_COMPLEX_4 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_COMPLEX_8)
|
#if defined (HAVE_GFC_COMPLEX_8)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
|
||||||
|
const int *, const GFC_COMPLEX_8 *, const int *,
|
||||||
|
const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b);
|
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_c8);
|
export_proto(matmul_c8);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_c8 (gfc_array_c8 * const restrict retarray,
|
matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b)
|
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_8 * restrict abase;
|
const GFC_COMPLEX_8 * restrict abase;
|
||||||
const GFC_COMPLEX_8 * restrict bbase;
|
const GFC_COMPLEX_8 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_COMPLEX_8 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_COMPLEX_8 * restrict bbase_y;
|
const GFC_COMPLEX_8 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_16)
|
#if defined (HAVE_GFC_INTEGER_16)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
|
||||||
|
const int *, const GFC_INTEGER_16 *, const int *,
|
||||||
|
const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b);
|
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_i16);
|
export_proto(matmul_i16);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_i16 (gfc_array_i16 * const restrict retarray,
|
matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b)
|
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_16 * restrict abase;
|
const GFC_INTEGER_16 * restrict abase;
|
||||||
const GFC_INTEGER_16 * restrict bbase;
|
const GFC_INTEGER_16 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_INTEGER_16 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_16 * restrict bbase_y;
|
const GFC_INTEGER_16 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_4)
|
#if defined (HAVE_GFC_INTEGER_4)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
|
||||||
|
const int *, const GFC_INTEGER_4 *, const int *,
|
||||||
|
const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b);
|
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_i4);
|
export_proto(matmul_i4);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_i4 (gfc_array_i4 * const restrict retarray,
|
matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b)
|
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_4 * restrict abase;
|
const GFC_INTEGER_4 * restrict abase;
|
||||||
const GFC_INTEGER_4 * restrict bbase;
|
const GFC_INTEGER_4 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_INTEGER_4 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_4 * restrict bbase_y;
|
const GFC_INTEGER_4 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_INTEGER_8)
|
#if defined (HAVE_GFC_INTEGER_8)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
|
||||||
|
const int *, const GFC_INTEGER_8 *, const int *,
|
||||||
|
const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b);
|
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_i8);
|
export_proto(matmul_i8);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_i8 (gfc_array_i8 * const restrict retarray,
|
matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b)
|
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_8 * restrict abase;
|
const GFC_INTEGER_8 * restrict abase;
|
||||||
const GFC_INTEGER_8 * restrict bbase;
|
const GFC_INTEGER_8 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_INTEGER_8 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_INTEGER_8 * restrict bbase_y;
|
const GFC_INTEGER_8 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_10)
|
#if defined (HAVE_GFC_REAL_10)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
|
||||||
|
const int *, const GFC_REAL_10 *, const int *,
|
||||||
|
const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b);
|
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_r10);
|
export_proto(matmul_r10);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_r10 (gfc_array_r10 * const restrict retarray,
|
matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b)
|
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_REAL_10 * restrict abase;
|
const GFC_REAL_10 * restrict abase;
|
||||||
const GFC_REAL_10 * restrict bbase;
|
const GFC_REAL_10 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_REAL_10 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_REAL_10 * restrict bbase_y;
|
const GFC_REAL_10 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_16)
|
#if defined (HAVE_GFC_REAL_16)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
|
||||||
|
const int *, const GFC_REAL_16 *, const int *,
|
||||||
|
const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b);
|
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_r16);
|
export_proto(matmul_r16);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_r16 (gfc_array_r16 * const restrict retarray,
|
matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b)
|
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_REAL_16 * restrict abase;
|
const GFC_REAL_16 * restrict abase;
|
||||||
const GFC_REAL_16 * restrict bbase;
|
const GFC_REAL_16 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_REAL_16 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_REAL_16 * restrict bbase_y;
|
const GFC_REAL_16 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_4)
|
#if defined (HAVE_GFC_REAL_4)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
|
||||||
|
const int *, const GFC_REAL_4 *, const int *,
|
||||||
|
const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b);
|
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_r4);
|
export_proto(matmul_r4);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_r4 (gfc_array_r4 * const restrict retarray,
|
matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b)
|
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_REAL_4 * restrict abase;
|
const GFC_REAL_4 * restrict abase;
|
||||||
const GFC_REAL_4 * restrict bbase;
|
const GFC_REAL_4 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_REAL_4 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_REAL_4 * restrict bbase_y;
|
const GFC_REAL_4 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
|
||||||
|
|
||||||
#if defined (HAVE_GFC_REAL_8)
|
#if defined (HAVE_GFC_REAL_8)
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
|
||||||
|
const int *, const GFC_REAL_8 *, const int *,
|
||||||
|
const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b);
|
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_r8);
|
export_proto(matmul_r8);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_r8 (gfc_array_r8 * const restrict retarray,
|
matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b)
|
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const GFC_REAL_8 * restrict abase;
|
const GFC_REAL_8 * restrict abase;
|
||||||
const GFC_REAL_8 * restrict bbase;
|
const GFC_REAL_8 * restrict bbase;
|
||||||
|
|
@ -177,6 +193,31 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const GFC_REAL_8 one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const GFC_REAL_8 * restrict bbase_y;
|
const GFC_REAL_8 * restrict bbase_y;
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,16 @@ include(iparm.m4)dnl
|
||||||
|
|
||||||
`#if defined (HAVE_'rtype_name`)'
|
`#if defined (HAVE_'rtype_name`)'
|
||||||
|
|
||||||
|
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
|
||||||
|
passed to us by the front-end, in which case we'll call it for large
|
||||||
|
matrices. */
|
||||||
|
|
||||||
|
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
|
||||||
|
const int *, const rtype_name *, const rtype_name *,
|
||||||
|
const int *, const rtype_name *, const int *,
|
||||||
|
const rtype_name *, rtype_name *, const int *,
|
||||||
|
int, int);
|
||||||
|
|
||||||
/* The order of loops is different in the case of plain matrix
|
/* The order of loops is different in the case of plain matrix
|
||||||
multiplication C=MATMUL(A,B), and in the frequent special case where
|
multiplication C=MATMUL(A,B), and in the frequent special case where
|
||||||
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
the argument A is the temporary result of a TRANSPOSE intrinsic:
|
||||||
|
|
@ -57,18 +67,24 @@ include(iparm.m4)dnl
|
||||||
DO I=1,M
|
DO I=1,M
|
||||||
S = 0
|
S = 0
|
||||||
DO K=1,COUNT
|
DO K=1,COUNT
|
||||||
S = S+A(I,K)+B(K,J)
|
S = S+A(I,K)*B(K,J)
|
||||||
C(I,J) = S
|
C(I,J) = S
|
||||||
ENDIF
|
ENDIF
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
/* If try_blas is set to a nonzero value, then the matmul function will
|
||||||
|
see if there is a way to perform the matrix multiplication by a call
|
||||||
|
to the BLAS gemm function. */
|
||||||
|
|
||||||
extern void matmul_`'rtype_code (rtype * const restrict retarray,
|
extern void matmul_`'rtype_code (rtype * const restrict retarray,
|
||||||
rtype * const restrict a, rtype * const restrict b);
|
rtype * const restrict a, rtype * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm);
|
||||||
export_proto(matmul_`'rtype_code);
|
export_proto(matmul_`'rtype_code);
|
||||||
|
|
||||||
void
|
void
|
||||||
matmul_`'rtype_code (rtype * const restrict retarray,
|
matmul_`'rtype_code (rtype * const restrict retarray,
|
||||||
rtype * const restrict a, rtype * const restrict b)
|
rtype * const restrict a, rtype * const restrict b, int try_blas,
|
||||||
|
int blas_limit, blas_call gemm)
|
||||||
{
|
{
|
||||||
const rtype_name * restrict abase;
|
const rtype_name * restrict abase;
|
||||||
const rtype_name * restrict bbase;
|
const rtype_name * restrict bbase;
|
||||||
|
|
@ -179,6 +195,31 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
|
||||||
bbase = b->data;
|
bbase = b->data;
|
||||||
dest = retarray->data;
|
dest = retarray->data;
|
||||||
|
|
||||||
|
|
||||||
|
/* Now that everything is set up, we're performing the multiplication
|
||||||
|
itself. */
|
||||||
|
|
||||||
|
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
|
||||||
|
|
||||||
|
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
|
||||||
|
&& (bxstride == 1 || bystride == 1)
|
||||||
|
&& (((float) xcount) * ((float) ycount) * ((float) count)
|
||||||
|
> POW3(blas_limit)))
|
||||||
|
{
|
||||||
|
const int m = xcount, n = ycount, k = count, ldc = rystride;
|
||||||
|
const rtype_name one = 1, zero = 0;
|
||||||
|
const int lda = (axstride == 1) ? aystride : axstride,
|
||||||
|
ldb = (bxstride == 1) ? bystride : bxstride;
|
||||||
|
|
||||||
|
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
|
||||||
|
{
|
||||||
|
assert (gemm != NULL);
|
||||||
|
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
|
||||||
|
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
if (rxstride == 1 && axstride == 1 && bxstride == 1)
|
||||||
{
|
{
|
||||||
const rtype_name * restrict bbase_y;
|
const rtype_name * restrict bbase_y;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue