re PR fortran/29550 (Optimize -fexternal-blas calls for conjg())

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.h (gfc_expr): Add external_blas flag.
	* frontend-passes.c (matrix_case): Add case A2TB2T.
	(optimize_namespace): Handle flag_external_blas by
	calling call_external_blas.
	(get_array_inq_function): Add argument okind. If
	it is nonzero, use it as the kind of argument
	to be used.
	(inline_limit_check): Remove m_case argument, add
	limit argument instead.  Remove assert about m_case.
	Set the limit for inlining from the limit argument.
	(matmul_lhs_realloc): Handle case A2TB2T.
	(inline_matmul_assign): Handle inline limit for other cases with
	two rank-two matrices.  Remove no-op calls to inline_limit_check.
	(call_external_blas): New function.
	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Do not add
	argument to external BLAS if external_blas is already set.

2018-09-18  Thomas Koenig  <tkoenig@gcc.gnu.org>

	PR fortran/29550
	* gfortran.dg/inline_matmul_13.f90: Adjust count for
	_gfortran_matmul.
	* gfortran.dg/inline_matmul_16.f90: Likewise.
	* gfortran.dg/promotion_2.f90: Add -fblas-matmul-limit=1.  Scan
	for dgemm instead of dgemm_.  Add call to random_number to make
	standard conforming.
	* gfortran.dg/matmul_blas_1.f90: New test.
	* gfortran.dg/matmul_bounds_14.f: New test.
	* gfortran.dg/matmul_bounds_15.f: New test.
	* gfortran.dg/matmul_bounds_16.f: New test.
	* gfortran.dg/blas_gemm_routines.f: New test / additional file for
	preceding tests.

From-SVN: r264412
This commit is contained in:
Thomas Koenig 2018-09-18 20:18:09 +00:00
parent 5c470e0f07
commit 998511a610
11 changed files with 2701 additions and 22 deletions

View File

@ -53,6 +53,7 @@ static gfc_code * create_do_loop (gfc_expr *, gfc_expr *, gfc_expr *,
char *vname=NULL); char *vname=NULL);
static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *, static gfc_expr* check_conjg_transpose_variable (gfc_expr *, bool *,
bool *); bool *);
static int call_external_blas (gfc_code **, int *, void *);
static bool has_dimen_vector_ref (gfc_expr *); static bool has_dimen_vector_ref (gfc_expr *);
static int matmul_temp_args (gfc_code **, int *,void *data); static int matmul_temp_args (gfc_code **, int *,void *data);
static int index_interchange (gfc_code **, int*, void *); static int index_interchange (gfc_code **, int*, void *);
@ -131,7 +132,7 @@ static int var_num = 1;
/* What sort of matrix we are dealing with when inlining MATMUL. */ /* What sort of matrix we are dealing with when inlining MATMUL. */
enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2 }; enum matrix_case { none=0, A2B2, A2B1, A1B2, A2B2T, A2TB2, A2TB2T };
/* Keep track of the number of expressions we have inserted so far /* Keep track of the number of expressions we have inserted so far
using create_var. */ using create_var. */
@ -1428,7 +1429,7 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL); gfc_code_walker (&ns->code, convert_elseif, dummy_expr_callback, NULL);
gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL); gfc_code_walker (&ns->code, cfe_code, cfe_expr_0, NULL);
gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL); gfc_code_walker (&ns->code, optimize_code, optimize_expr, NULL);
if (flag_inline_matmul_limit != 0) if (flag_inline_matmul_limit != 0 || flag_external_blas)
{ {
bool found; bool found;
do do
@ -1441,9 +1442,15 @@ optimize_namespace (gfc_namespace *ns)
gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback, gfc_code_walker (&ns->code, matmul_temp_args, dummy_expr_callback,
NULL); NULL);
gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
NULL);
} }
if (flag_external_blas)
gfc_code_walker (&ns->code, call_external_blas, dummy_expr_callback,
NULL);
if (flag_inline_matmul_limit != 0)
gfc_code_walker (&ns->code, inline_matmul_assign, dummy_expr_callback,
NULL);
} }
if (flag_frontend_loop_interchange) if (flag_frontend_loop_interchange)
@ -2938,7 +2945,7 @@ matmul_temp_args (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
dim is zero-based. */ dim is zero-based. */
static gfc_expr * static gfc_expr *
get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim) get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim, int okind = 0)
{ {
gfc_expr *fcn; gfc_expr *fcn;
gfc_expr *dim_arg, *kind; gfc_expr *dim_arg, *kind;
@ -2964,8 +2971,12 @@ get_array_inq_function (gfc_isym_id id, gfc_expr *e, int dim)
} }
dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim); dim_arg = gfc_get_int_expr (gfc_default_integer_kind, &e->where, dim);
kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where, if (okind != 0)
gfc_index_integer_kind); kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
okind);
else
kind = gfc_get_int_expr (gfc_default_integer_kind, &e->where,
gfc_index_integer_kind);
ec = gfc_copy_expr (e); ec = gfc_copy_expr (e);
@ -3026,7 +3037,7 @@ get_operand (gfc_intrinsic_op op, gfc_expr *e1, gfc_expr *e2)
removed by DCE. Only called for rank-two matrices A and B. */ removed by DCE. Only called for rank-two matrices A and B. */
static gfc_code * static gfc_code *
inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case) inline_limit_check (gfc_expr *a, gfc_expr *b, int limit)
{ {
gfc_expr *inline_limit; gfc_expr *inline_limit;
gfc_code *if_1, *if_2, *else_2; gfc_code *if_1, *if_2, *else_2;
@ -3034,14 +3045,11 @@ inline_limit_check (gfc_expr *a, gfc_expr *b, enum matrix_case m_case)
gfc_typespec ts; gfc_typespec ts;
gfc_expr *cond; gfc_expr *cond;
gcc_assert (m_case == A2B2 || m_case == A2B2T || m_case == A2TB2);
/* Calculation is done in real to avoid integer overflow. */ /* Calculation is done in real to avoid integer overflow. */
inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind, inline_limit = gfc_get_constant_expr (BT_REAL, gfc_default_real_kind,
&a->where); &a->where);
mpfr_set_si (inline_limit->value.real, flag_inline_matmul_limit, mpfr_set_si (inline_limit->value.real, limit, GFC_RND_MODE);
GFC_RND_MODE);
mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3, mpfr_pow_ui (inline_limit->value.real, inline_limit->value.real, 3,
GFC_RND_MODE); GFC_RND_MODE);
@ -3235,6 +3243,22 @@ matmul_lhs_realloc (gfc_expr *c, gfc_expr *a, gfc_expr *b,
get_array_inq_function (GFC_ISYM_SIZE, b, 2)); get_array_inq_function (GFC_ISYM_SIZE, b, 2));
break; break;
case A2TB2T:
/* This can only happen for BLAS, we do not handle that case in
inline mamtul. */
ar->start[0] = get_array_inq_function (GFC_ISYM_SIZE, a, 2);
ar->start[1] = get_array_inq_function (GFC_ISYM_SIZE, b, 1);
ne1 = build_logical_expr (INTRINSIC_NE,
get_array_inq_function (GFC_ISYM_SIZE, c, 1),
get_array_inq_function (GFC_ISYM_SIZE, a, 2));
ne2 = build_logical_expr (INTRINSIC_NE,
get_array_inq_function (GFC_ISYM_SIZE, c, 2),
get_array_inq_function (GFC_ISYM_SIZE, b, 1));
cond = build_logical_expr (INTRINSIC_OR, ne1, ne2);
break;
default: default:
gcc_unreachable(); gcc_unreachable();
@ -3946,9 +3970,11 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
/* Take care of the inline flag. If the limit check evaluates to a /* Take care of the inline flag. If the limit check evaluates to a
constant, dead code elimination will eliminate the unneeded branch. */ constant, dead code elimination will eliminate the unneeded branch. */
if (m_case == A2B2 && flag_inline_matmul_limit > 0) if (flag_inline_matmul_limit > 0 && matrix_a->rank == 2
&& matrix_b->rank == 2)
{ {
if_limit = inline_limit_check (matrix_a, matrix_b, m_case); if_limit = inline_limit_check (matrix_a, matrix_b,
flag_inline_matmul_limit);
/* Insert the original statement into the else branch. */ /* Insert the original statement into the else branch. */
if_limit->block->block->next = co; if_limit->block->block->next = co;
@ -4118,7 +4144,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
switch (m_case) switch (m_case)
{ {
case A2B2: case A2B2:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 2); u1 = get_size_m1 (matrix_b, 2);
u2 = get_size_m1 (matrix_a, 2); u2 = get_size_m1 (matrix_a, 2);
@ -4151,7 +4176,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break; break;
case A2B2T: case A2B2T:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_b, 1); u1 = get_size_m1 (matrix_b, 1);
u2 = get_size_m1 (matrix_a, 2); u2 = get_size_m1 (matrix_a, 2);
@ -4184,7 +4208,6 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
break; break;
case A2TB2: case A2TB2:
inline_limit_check (matrix_a, matrix_b, m_case);
u1 = get_size_m1 (matrix_a, 2); u1 = get_size_m1 (matrix_a, 2);
u2 = get_size_m1 (matrix_b, 2); u2 = get_size_m1 (matrix_b, 2);
@ -4311,6 +4334,405 @@ inline_matmul_assign (gfc_code **c, int *walk_subtrees,
return 0; return 0;
} }
/* Change matmul function calls in the form of
c = matmul(a,b)
to the corresponding call to a BLAS routine, if applicable. */
static int
call_external_blas (gfc_code **c, int *walk_subtrees ATTRIBUTE_UNUSED,
void *data ATTRIBUTE_UNUSED)
{
gfc_code *co, *co_next;
gfc_expr *expr1, *expr2;
gfc_expr *matrix_a, *matrix_b;
gfc_code *if_limit = NULL;
gfc_actual_arglist *a, *b;
bool conjg_a, conjg_b, transpose_a, transpose_b;
gfc_code *call;
const char *blas_name;
const char *transa, *transb;
gfc_expr *c1, *c2, *b1;
gfc_actual_arglist *actual, *next;
bt type;
int kind;
enum matrix_case m_case;
bool realloc_c;
gfc_code **next_code_point;
/* Many of the tests for inline matmul also apply here. */
co = *c;
if (co->op != EXEC_ASSIGN)
return 0;
if (in_where || in_assoc_list)
return 0;
/* The BLOCKS generated for the temporary variables and FORALL don't
mix. */
if (forall_level > 0)
return 0;
/* For now don't do anything in OpenMP workshare, it confuses
its translation, which expects only the allowed statements in there. */
if (in_omp_workshare)
return 0;
expr1 = co->expr1;
expr2 = co->expr2;
if (expr2->expr_type != EXPR_FUNCTION
|| expr2->value.function.isym == NULL
|| expr2->value.function.isym->id != GFC_ISYM_MATMUL)
return 0;
type = expr2->ts.type;
kind = expr2->ts.kind;
/* Guard against recursion. */
if (expr2->external_blas)
return 0;
if (type != expr1->ts.type || kind != expr1->ts.kind)
return 0;
if (type == BT_REAL)
{
if (kind == 4)
blas_name = "sgemm";
else if (kind == 8)
blas_name = "dgemm";
else
return 0;
}
else if (type == BT_COMPLEX)
{
if (kind == 4)
blas_name = "cgemm";
else if (kind == 8)
blas_name = "zgemm";
else
return 0;
}
else
return 0;
a = expr2->value.function.actual;
if (a->expr->rank != 2)
return 0;
b = a->next;
if (b->expr->rank != 2)
return 0;
matrix_a = check_conjg_transpose_variable (a->expr, &conjg_a, &transpose_a);
if (matrix_a == NULL)
return 0;
if (transpose_a)
{
if (conjg_a)
transa = "C";
else
transa = "T";
}
else
transa = "N";
matrix_b = check_conjg_transpose_variable (b->expr, &conjg_b, &transpose_b);
if (matrix_b == NULL)
return 0;
if (transpose_b)
{
if (conjg_b)
transb = "C";
else
transb = "T";
}
else
transb = "N";
if (transpose_a)
{
if (transpose_b)
m_case = A2TB2T;
else
m_case = A2TB2;
}
else
{
if (transpose_b)
m_case = A2B2T;
else
m_case = A2B2;
}
current_code = c;
inserted_block = NULL;
changed_statement = NULL;
expr2->external_blas = 1;
/* We do not handle data dependencies yet. */
if (gfc_check_dependency (expr1, matrix_a, true)
|| gfc_check_dependency (expr1, matrix_b, true))
return 0;
/* Generate the if statement and hang it into the tree. */
if_limit = inline_limit_check (matrix_a, matrix_b, flag_blas_matmul_limit);
co_next = co->next;
(*current_code) = if_limit;
co->next = NULL;
if_limit->block->next = co;
call = XCNEW (gfc_code);
call->loc = co->loc;
/* Bounds checking - a bit simpler than for inlining since we only
have to take care of two-dimensional arrays here. */
realloc_c = flag_realloc_lhs && gfc_is_reallocatable_lhs (expr1);
next_code_point = &(if_limit->block->block->next);
if (gfc_option.rtcheck & GFC_RTCHECK_BOUNDS)
{
gfc_code *test;
// gfc_expr *a2, *b1, *c1, *c2, *a1, *b2;
gfc_expr *c1, *a1, *c2, *b2, *a2;
switch (m_case)
{
case A2B2:
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (b1, a2, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (c1, a1, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
test = runtime_error_ne (c2, b2, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2B2T:
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
/* matrix_b is transposed, hence dimension 1 for the error message. */
test = runtime_error_ne (b2, a2, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (c1, a1, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
test = runtime_error_ne (c2, b1, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2TB2:
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (b1, a1, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (c1, a2, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
test = runtime_error_ne (c2, b2, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
case A2TB2T:
b2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 2);
a1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 1);
test = runtime_error_ne (b2, a1, B_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
if (!realloc_c)
{
c1 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 1);
a2 = get_array_inq_function (GFC_ISYM_SIZE, matrix_a, 2);
test = runtime_error_ne (c1, a2, C_ERROR(1));
*next_code_point = test;
next_code_point = &test->next;
c2 = get_array_inq_function (GFC_ISYM_SIZE, expr1, 2);
b1 = get_array_inq_function (GFC_ISYM_SIZE, matrix_b, 1);
test = runtime_error_ne (c2, b1, C_ERROR(2));
*next_code_point = test;
next_code_point = &test->next;
}
break;
default:
gcc_unreachable ();
}
}
/* Handle the reallocation, if needed. */
if (realloc_c)
{
gfc_code *lhs_alloc;
lhs_alloc = matmul_lhs_realloc (expr1, matrix_a, matrix_b, m_case);
*next_code_point = lhs_alloc;
next_code_point = &lhs_alloc->next;
}
*next_code_point = call;
if_limit->next = co_next;
/* Set up the BLAS call. */
call->op = EXEC_CALL;
gfc_get_sym_tree (blas_name, current_ns, &(call->symtree), true);
call->symtree->n.sym->attr.subroutine = 1;
call->symtree->n.sym->attr.procedure = 1;
call->symtree->n.sym->attr.flavor = FL_PROCEDURE;
call->resolved_sym = call->symtree->n.sym;
/* Argument TRANSA. */
next = gfc_get_actual_arglist ();
next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
transa, 1);
call->ext.actual = next;
/* Argument TRANSB. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_character_expr (gfc_default_character_kind, &co->loc,
transb, 1);
actual->next = next;
c1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (a->expr), 1,
gfc_integer_4_kind);
c2 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 2,
gfc_integer_4_kind);
b1 = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (b->expr), 1,
gfc_integer_4_kind);
/* Argument M. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = c1;
actual->next = next;
/* Argument N. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = c2;
actual->next = next;
/* Argument K. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = b1;
actual->next = next;
/* Argument ALPHA - set to one. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_constant_expr (type, kind, &co->loc);
if (type == BT_REAL)
mpfr_set_ui (next->expr->value.real, 1, GFC_RND_MODE);
else
mpc_set_ui (next->expr->value.complex, 1, GFC_MPC_RND_MODE);
actual->next = next;
/* Argument A. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (matrix_a);
actual->next = next;
/* Argument LDA. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_a),
1, gfc_integer_4_kind);
actual->next = next;
/* Argument B. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (matrix_b);
actual->next = next;
/* Argument LDB. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (matrix_b),
1, gfc_integer_4_kind);
actual->next = next;
/* Argument BETA - set to zero. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_get_constant_expr (type, kind, &co->loc);
if (type == BT_REAL)
mpfr_set_ui (next->expr->value.real, 0, GFC_RND_MODE);
else
mpc_set_ui (next->expr->value.complex, 0, GFC_MPC_RND_MODE);
actual->next = next;
/* Argument C. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = gfc_copy_expr (expr1);
actual->next = next;
/* Argument LDC. */
actual = next;
next = gfc_get_actual_arglist ();
next->expr = get_array_inq_function (GFC_ISYM_SIZE, gfc_copy_expr (expr1),
1, gfc_integer_4_kind);
actual->next = next;
return 0;
}
/* Code for index interchange for loops which are grouped together in DO /* Code for index interchange for loops which are grouped together in DO
CONCURRENT or FORALL statements. This is currently only applied if the CONCURRENT or FORALL statements. This is currently only applied if the

View File

@ -2143,6 +2143,11 @@ typedef struct gfc_expr
unsigned int no_bounds_check : 1; unsigned int no_bounds_check : 1;
/* Set this if a matmul expression has already been evaluated for conversion
to a BLAS call. */
unsigned int external_blas : 1;
/* If an expression comes from a Hollerith constant or compile-time /* If an expression comes from a Hollerith constant or compile-time
evaluation of a transfer statement, it may have a prescribed target- evaluation of a transfer statement, it may have a prescribed target-
memory representation, and these cannot always be backformed from memory representation, and these cannot always be backformed from

View File

@ -4055,6 +4055,7 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
to be able to call the BLAS ?gemm functions if required and possible. */ to be able to call the BLAS ?gemm functions if required and possible. */
append_args = NULL; append_args = NULL;
if (expr->value.function.isym->id == GFC_ISYM_MATMUL if (expr->value.function.isym->id == GFC_ISYM_MATMUL
&& !expr->external_blas
&& sym->ts.type != BT_LOGICAL) && sym->ts.type != BT_LOGICAL)
{ {
tree cint = gfc_get_int_type (gfc_c_int_kind); tree cint = gfc_get_int_type (gfc_c_int_kind);

File diff suppressed because it is too large Load Diff

View File

@ -44,4 +44,4 @@ program main
deallocate(calloc) deallocate(calloc)
end program main end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "original" } } ! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "original" } }

View File

@ -58,4 +58,4 @@ program main
end do end do
end do end do
end program main end program main
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } } ! { dg-final { scan-tree-dump-times "_gfortran_matmul" 1 "optimized" } }

View File

@ -0,0 +1,240 @@
C { dg-do run }
C { dg-options "-fcheck=bounds -fdump-tree-optimized -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-additional-sources blas_gemm_routines.f }
C Test calling of BLAS routines
program main
call sub_s
call sub_d
call sub_c
call sub_z
end
subroutine sub_d
implicit none
real(8), dimension(3,2) :: a
real(8), dimension(2,3) :: at
real(8), dimension(2,4) :: b
real(8), dimension(4,2) :: bt
real(8), dimension(3,4) :: c
real(8), dimension(3,4) :: cres
real(8), dimension(:,:), allocatable :: c_alloc
data a / 2., -3., 5., -7., 11., -13./
data b /17., -23., 29., -31., 37., -39., 41., -47./
data cres /195., -304., 384., 275., -428., 548., 347., -540.,
& 692., 411., -640., 816./
c = matmul(a,b)
if (any (c /= cres)) stop 31
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 32
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 33
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 34
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 35
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 36
end
subroutine sub_s
implicit none
real, dimension(3,2) :: a
real, dimension(2,3) :: at
real, dimension(2,4) :: b
real, dimension(4,2) :: bt
real, dimension(3,4) :: c
real, dimension(3,4) :: cres
real, dimension(:,:), allocatable :: c_alloc
data a / 2., -3., 5., -7., 11., -13./
data b /17., -23., 29., -31., 37., -39., 41., -47./
data cres /195., -304., 384., 275., -428., 548., 347., -540.,
& 692., 411., -640., 816./
c = matmul(a,b)
if (any (c /= cres)) stop 21
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 22
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 23
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 24
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 25
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 26
end
subroutine sub_c
implicit none
complex, dimension(3,2) :: a
complex, dimension(2,3) :: at, ah
complex, dimension(2,4) :: b
complex, dimension(4,2) :: bt, bh
complex, dimension(3,4) :: c
complex, dimension(3,4) :: cres
complex, dimension(:,:), allocatable :: c_alloc
data a / (2.,-3.), (-5.,7.), (11.,-13.), (17.,19), (-23., -29),
& (-31., 37.)/
data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
& ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
& (-2789.,-11.),
& (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
& (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
c = matmul(a,b)
if (any (c /= cres)) stop 1
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 2
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 3
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 4
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 5
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 6
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 7
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 8
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 9
deallocate (c_alloc)
allocate (c_alloc(0,0))
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 10
end
subroutine sub_z
implicit none
complex(8), dimension(3,2) :: a
complex(8), dimension(2,3) :: at, ah
complex(8), dimension(2,4) :: b
complex(8), dimension(4,2) :: bt, bh
complex(8), dimension(3,4) :: c
complex(8), dimension(3,4) :: cres
complex(8), dimension(:,:), allocatable :: c_alloc
data a / (2.,-3.), (-5._8,7.), (11.,-13.), (17.,19),
& (-23., -29), (-31., 37.)/
data b / (-41., 43.), (-47., 53.), (-59.,-61.), (-67., 71),
& ( 73.,79. ), (83.,-89.), (97.,-101.), (-107.,-109.)/
data cres /(-1759.,217.), (2522.,-358.), (-396.,-2376.),
& (-2789.,-11.),
& (4322.,202.), (-1992.,-4584.), (3485.,3.), (-5408.,-244.),
& (2550.,5750.), (143.,-4379.), (-478.,6794.), (7104.,-2952.) /
c = matmul(a,b)
if (any (c /= cres)) stop 11
at = transpose(a)
c = (1.2,-2.2)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 12
bt = transpose(b)
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 13
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 14
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 15
c_alloc = matmul(a,b)
if (any (c /= cres)) stop 16
at = transpose(a)
deallocate (c_alloc)
c = matmul(transpose(at), b)
if (any (c /= cres)) stop 17
bt = transpose(b)
allocate (c_alloc(20,20))
c = (1.2,-2.1)
c = matmul(a, transpose(bt))
if (any (c /= cres)) stop 18
ah = transpose(conjg(a))
c = (1.2,-2.2)
c = matmul(conjg(transpose(ah)), b)
if (any (c /= cres)) stop 19
deallocate (c_alloc)
allocate (c_alloc(0,0))
bh = transpose(conjg(b))
c = (1.2,-2.2)
c = matmul(a, transpose(conjg(bh)))
if (any (c /= cres)) stop 20
end
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }

View File

@ -0,0 +1,16 @@
C { dg-do run }
C { dg-options "-fno-realloc-lhs -fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Array bound mismatch for dimension 2 of array." }
C { dg-additional-sources blas_gemm_routines.f }
program main
real, dimension(3,2) :: a
real, dimension(2,3) :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
b = 2.3
allocate(ret(3,2))
ret = matmul(a,b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Array bound mismatch for dimension 2 of array.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }

View File

@ -0,0 +1,19 @@
C { dg-do run }
C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
C { dg-additional-sources blas_gemm_routines.f }
program main
character(len=20) :: line
integer :: n, m
real, dimension(3,2) :: a
real, dimension(:,:), allocatable :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
line = '3 3'
read (unit=line,fmt=*) n, m
allocate (b(n,m))
b = 2.3
ret = matmul(a,b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }

View File

@ -0,0 +1,20 @@
C { dg-do run }
C { dg-options "-fdump-tree-optimized -fcheck=bounds -fblas-matmul-limit=1 -O -fexternal-blas" }
C { dg-shouldfail "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1" }
C { dg-additional-sources blas_gemm_routines.f }
program main
character(len=20) :: line
integer :: n, m
real, dimension(3,2) :: a
real, dimension(:,:), allocatable :: b
real, dimension(:,:), allocatable :: ret
a = 1.0
line = '4 3'
read (unit=line,fmt=*) n, m
allocate (b(n,m))
b = 2.3
ret = matmul(transpose(a),b) ! This should throw an error.
end
! { dg-output "Fortran runtime error: Incorrect extent in argument B in MATMUL intrinsic in dimension 1.*" }
! { dg-final { scan-tree-dump-times "_gfortran_matmul" 0 "optimized" } }

View File

@ -1,5 +1,5 @@
! { dg-do compile } ! { dg-do compile }
! { dg-options "-fdefault-real-8 -fexternal-blas -fdump-tree-original -finline-matmul-limit=0" } ! { dg-options "-fdefault-real-8 -fexternal-blas -fblas-matmul-limit=1 -fdump-tree-original -finline-matmul-limit=0" }
! !
! PR fortran/54463 ! PR fortran/54463
! !
@ -8,8 +8,9 @@
program test program test
implicit none implicit none
real, dimension(3,3) :: A real, dimension(3,3) :: A
call random_number(a)
A = matmul(A,A) A = matmul(A,A)
end program test end program test
! { dg-final { scan-tree-dump-times "sgemm_" 0 "original" } } ! { dg-final { scan-tree-dump-times "sgemm" 0 "original" } }
! { dg-final { scan-tree-dump-times "dgemm_" 1 "original" } } ! { dg-final { scan-tree-dump-times "dgemm" 1 "original" } }