mirror of git://gcc.gnu.org/git/gcc.git
Optimise sqrt reciprocal multiplications
This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations.
In particular consider:
x = 1.0 / sqrt (a);
r1 = x * x; // same as 1.0 / a
r2 = a * x; // same as sqrt (a)
If x, r1 and r2 are all used further on in the code, this can be transformed into:
tmp1 = 1.0 / a
tmp2 = sqrt (a)
tmp3 = tmp1 * tmp2
x = tmp3
r1 = tmp1
r2 = tmp2
A bit convoluted, but this saves us one multiplication and, more importantly, the sqrt and division are now independent.
This also allows optimisation of a subset of these expressions.
For example:
x = 1.0 / sqrt (a)
r1 = x * x
can be transformed to r1 = 1.0 / a, eliminating the sqrt if x is not used anywhere else.
And similarly:
x = 1.0 / sqrt (a)
r1 = a * x
can be transformed to sqrt (a) eliminating the division.
For the testcase:
double res, res2, tmp;
void
foo (double a, double b)
{
tmp = 1.0 / __builtin_sqrt (a);
res = tmp * tmp;
res2 = a * tmp;
}
We now generate for aarch64 with -Ofast:
foo:
fmov d2, 1.0e+0
adrp x2, res2
fsqrt d1, d0
adrp x1, res
fdiv d0, d2, d0
adrp x0, tmp
str d1, [x2, #:lo12:res2]
fmul d1, d1, d0
str d0, [x1, #:lo12:res]
str d1, [x0, #:lo12:tmp]
ret
where before it generated:
foo:
fsqrt d2, d0
fmov d1, 1.0e+0
adrp x1, res2
adrp x2, tmp
adrp x0, res
fdiv d1, d1, d2
fmul d0, d1, d0
fmul d2, d1, d1
str d1, [x2, #:lo12:tmp]
str d0, [x1, #:lo12:res2]
str d2, [x0, #:lo12:res]
ret
As you can see, the new sequence has one fewer multiply and the fsqrt and fdiv are independent.
* tree-ssa-math-opts.c (is_mult_by): New function.
(is_square_of): Use the above.
(optimize_recip_sqrt): New function.
(pass_cse_reciprocals::execute): Use the above.
* gcc.dg/recip_sqrt_mult_1.c: New test.
* gcc.dg/recip_sqrt_mult_2.c: Likewise.
* gcc.dg/recip_sqrt_mult_3.c: Likewise.
* gcc.dg/recip_sqrt_mult_4.c: Likewise.
* gcc.dg/recip_sqrt_mult_5.c: Likewise.
* g++.dg/recip_sqrt_mult_1.C: Likewise.
* g++.dg/recip_sqrt_mult_2.C: Likewise.
From-SVN: r264126
This commit is contained in:
parent
76a5eae549
commit
24c4943149
|
|
@ -1,3 +1,10 @@
|
|||
2018-09-05 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
|
||||
|
||||
* tree-ssa-math-opts.c (is_mult_by): New function.
|
||||
(is_square_of): Use the above.
|
||||
(optimize_recip_sqrt): New function.
|
||||
(pass_cse_reciprocals::execute): Use the above.
|
||||
|
||||
2018-09-05 Richard Biener <rguenther@suse.de>
|
||||
|
||||
PR bootstrap/87134
|
||||
|
|
|
|||
|
|
@ -1,3 +1,13 @@
|
|||
2018-09-05 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
|
||||
|
||||
* gcc.dg/recip_sqrt_mult_1.c: New test.
|
||||
* gcc.dg/recip_sqrt_mult_2.c: Likewise.
|
||||
* gcc.dg/recip_sqrt_mult_3.c: Likewise.
|
||||
* gcc.dg/recip_sqrt_mult_4.c: Likewise.
|
||||
* gcc.dg/recip_sqrt_mult_5.c: Likewise.
|
||||
* g++.dg/recip_sqrt_mult_1.C: Likewise.
|
||||
* g++.dg/recip_sqrt_mult_2.C: Likewise.
|
||||
|
||||
2018-09-05 Martin Liska <mliska@suse.cz>
|
||||
|
||||
PR tree-optimization/87205
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
|
||||
|
||||
double res, res2, tmp;
|
||||
void
|
||||
foo1 (double a, double b)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
res2 = a * tmp;
|
||||
}
|
||||
catch (...)
|
||||
{ ; }
|
||||
}
|
||||
|
||||
void
|
||||
foo4 (double a, double b, int c, int d)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
if (c)
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
foo5 (double a, double b, int c, int d)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
catch (...)
|
||||
{ ; }
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
|
||||
/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */
|
||||
|
||||
/* Check that the recip_sqrt optimization does not trigger here, causing an
|
||||
ICE due to EH info. */
|
||||
|
||||
|
||||
double res, res2, tmp;
|
||||
void
|
||||
foo1 (double a, double b)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
res2 = a * tmp;
|
||||
}
|
||||
catch (...)
|
||||
{ ; }
|
||||
}
|
||||
|
||||
void
|
||||
foo4 (double a, double b, int c, int d)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
if (c)
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
foo5 (double a, double b, int c, int d)
|
||||
{
|
||||
try {
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
catch (...)
|
||||
{ ; }
|
||||
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fdump-tree-recip" } */
|
||||
|
||||
double res, res2, tmp;
|
||||
void
|
||||
foo (double a, double b)
|
||||
{
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
res2 = a * tmp;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fdump-tree-optimized" } */
|
||||
|
||||
float
|
||||
foo (float a)
|
||||
{
|
||||
float tmp = 1.0f / __builtin_sqrtf (a);
|
||||
return a * tmp;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fdump-tree-optimized" } */
|
||||
|
||||
double
|
||||
foo (double a)
|
||||
{
|
||||
double tmp = 1.0f / __builtin_sqrt (a);
|
||||
return tmp * tmp;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fdump-tree-recip" } */
|
||||
|
||||
/* The main path doesn't have any multiplications.
|
||||
Avoid introducing them in the recip pass. */
|
||||
|
||||
double res, res2, tmp;
|
||||
void
|
||||
foo (double a, double b, int c, int d)
|
||||
{
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
if (c)
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
/* { dg-do compile } */
|
||||
/* { dg-options "-Ofast -fdump-tree-recip" } */
|
||||
|
||||
/* We want to do the recip_sqrt transformations here there is already
|
||||
a multiplication on the main path. */
|
||||
|
||||
double res, res2, tmp;
|
||||
void
|
||||
foo (double a, double b, int c, int d)
|
||||
{
|
||||
tmp = 1.0 / __builtin_sqrt (a);
|
||||
res = tmp * tmp;
|
||||
|
||||
if (d)
|
||||
res2 = a * tmp;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
|
||||
/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
|
||||
|
|
@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
|
|||
&& gimple_assign_rhs1 (use_stmt) != def;
|
||||
}
|
||||
|
||||
/* Return whether USE_STMT is DEF * DEF. */
|
||||
/* Return TRUE if USE_STMT is a multiplication of DEF by A. */
|
||||
static inline bool
|
||||
is_square_of (gimple *use_stmt, tree def)
|
||||
is_mult_by (gimple *use_stmt, tree def, tree a)
|
||||
{
|
||||
if (gimple_code (use_stmt) == GIMPLE_ASSIGN
|
||||
&& gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
|
||||
|
|
@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
|
|||
tree op0 = gimple_assign_rhs1 (use_stmt);
|
||||
tree op1 = gimple_assign_rhs2 (use_stmt);
|
||||
|
||||
return op0 == op1 && op0 == def;
|
||||
return (op0 == def && op1 == a)
|
||||
|| (op0 == a && op1 == def);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return whether USE_STMT is DEF * DEF. */
|
||||
static inline bool
|
||||
is_square_of (gimple *use_stmt, tree def)
|
||||
{
|
||||
return is_mult_by (use_stmt, def, def);
|
||||
}
|
||||
|
||||
/* Return whether USE_STMT is a floating-point division by
|
||||
DEF * DEF. */
|
||||
static inline bool
|
||||
|
|
@ -526,6 +534,188 @@ free_bb (struct occurrence *occ)
|
|||
}
|
||||
}
|
||||
|
||||
/* Transform sequences like
|
||||
t = sqrt (a)
|
||||
x = 1.0 / t;
|
||||
r1 = x * x;
|
||||
r2 = a * x;
|
||||
into:
|
||||
t = sqrt (a)
|
||||
r1 = 1.0 / a;
|
||||
r2 = t;
|
||||
x = r1 * r2;
|
||||
depending on the uses of x, r1, r2. This removes one multiplication and
|
||||
allows the sqrt and division operations to execute in parallel.
|
||||
DEF_GSI is the gsi of the initial division by sqrt that defines
|
||||
DEF (x in the example abovs). */
|
||||
|
||||
static void
|
||||
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
|
||||
{
|
||||
gimple *use_stmt;
|
||||
imm_use_iterator use_iter;
|
||||
gimple *stmt = gsi_stmt (*def_gsi);
|
||||
tree x = def;
|
||||
tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
|
||||
tree div_rhs1 = gimple_assign_rhs1 (stmt);
|
||||
|
||||
if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
|
||||
|| TREE_CODE (div_rhs1) != REAL_CST
|
||||
|| !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
|
||||
return;
|
||||
|
||||
gcall *sqrt_stmt
|
||||
= dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
|
||||
|
||||
if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
|
||||
return;
|
||||
|
||||
switch (gimple_call_combined_fn (sqrt_stmt))
|
||||
{
|
||||
CASE_CFN_SQRT:
|
||||
CASE_CFN_SQRT_FN:
|
||||
break;
|
||||
|
||||
default:
|
||||
return;
|
||||
}
|
||||
tree a = gimple_call_arg (sqrt_stmt, 0);
|
||||
|
||||
/* We have 'a' and 'x'. Now analyze the uses of 'x'. */
|
||||
|
||||
/* Statements that use x in x * x. */
|
||||
auto_vec<gimple *> sqr_stmts;
|
||||
/* Statements that use x in a * x. */
|
||||
auto_vec<gimple *> mult_stmts;
|
||||
bool has_other_use = false;
|
||||
bool mult_on_main_path = false;
|
||||
|
||||
FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
|
||||
{
|
||||
if (is_gimple_debug (use_stmt))
|
||||
continue;
|
||||
if (is_square_of (use_stmt, x))
|
||||
{
|
||||
sqr_stmts.safe_push (use_stmt);
|
||||
if (gimple_bb (use_stmt) == gimple_bb (stmt))
|
||||
mult_on_main_path = true;
|
||||
}
|
||||
else if (is_mult_by (use_stmt, x, a))
|
||||
{
|
||||
mult_stmts.safe_push (use_stmt);
|
||||
if (gimple_bb (use_stmt) == gimple_bb (stmt))
|
||||
mult_on_main_path = true;
|
||||
}
|
||||
else
|
||||
has_other_use = true;
|
||||
}
|
||||
|
||||
/* In the x * x and a * x cases we just rewire stmt operands or
|
||||
remove multiplications. In the has_other_use case we introduce
|
||||
a multiplication so make sure we don't introduce a multiplication
|
||||
on a path where there was none. */
|
||||
if (has_other_use && !mult_on_main_path)
|
||||
return;
|
||||
|
||||
if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
|
||||
return;
|
||||
|
||||
/* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
|
||||
to be able to compose it from the sqr and mult cases. */
|
||||
if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
|
||||
return;
|
||||
|
||||
if (dump_file)
|
||||
{
|
||||
fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
|
||||
print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
|
||||
print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
|
||||
fprintf (dump_file, "\n");
|
||||
}
|
||||
|
||||
bool delete_div = !has_other_use;
|
||||
tree sqr_ssa_name = NULL_TREE;
|
||||
if (!sqr_stmts.is_empty ())
|
||||
{
|
||||
/* r1 = x * x. Transform the original
|
||||
x = 1.0 / t
|
||||
into
|
||||
tmp1 = 1.0 / a
|
||||
r1 = tmp1. */
|
||||
|
||||
sqr_ssa_name
|
||||
= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
|
||||
|
||||
if (dump_file)
|
||||
{
|
||||
fprintf (dump_file, "Replacing original division\n");
|
||||
print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
|
||||
fprintf (dump_file, "with new division\n");
|
||||
}
|
||||
gimple_assign_set_lhs (stmt, sqr_ssa_name);
|
||||
gimple_assign_set_rhs2 (stmt, a);
|
||||
fold_stmt_inplace (def_gsi);
|
||||
update_stmt (stmt);
|
||||
|
||||
if (dump_file)
|
||||
print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
|
||||
|
||||
delete_div = false;
|
||||
gimple *sqr_stmt;
|
||||
unsigned int i;
|
||||
FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
|
||||
{
|
||||
gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
|
||||
gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
|
||||
update_stmt (sqr_stmt);
|
||||
}
|
||||
}
|
||||
if (!mult_stmts.is_empty ())
|
||||
{
|
||||
/* r2 = a * x. Transform this into:
|
||||
r2 = t (The original sqrt (a)). */
|
||||
unsigned int i;
|
||||
gimple *mult_stmt = NULL;
|
||||
FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
|
||||
{
|
||||
gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
|
||||
|
||||
if (dump_file)
|
||||
{
|
||||
fprintf (dump_file, "Replacing squaring multiplication\n");
|
||||
print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
|
||||
fprintf (dump_file, "with assignment\n");
|
||||
}
|
||||
gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
|
||||
fold_stmt_inplace (&gsi2);
|
||||
update_stmt (mult_stmt);
|
||||
if (dump_file)
|
||||
print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
if (has_other_use)
|
||||
{
|
||||
/* Using the two temporaries tmp1, tmp2 from above
|
||||
the original x is now:
|
||||
x = tmp1 * tmp2. */
|
||||
gcc_assert (orig_sqrt_ssa_name);
|
||||
gcc_assert (sqr_ssa_name);
|
||||
|
||||
gimple *new_stmt
|
||||
= gimple_build_assign (x, MULT_EXPR,
|
||||
orig_sqrt_ssa_name, sqr_ssa_name);
|
||||
gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
|
||||
update_stmt (stmt);
|
||||
}
|
||||
else if (delete_div)
|
||||
{
|
||||
/* Remove the original division. */
|
||||
gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
|
||||
gsi_remove (&gsi2, true);
|
||||
release_defs (stmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Look for floating-point divisions among DEF's uses, and try to
|
||||
replace them by multiplications with the reciprocal. Add
|
||||
|
|
@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun)
|
|||
&& (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
|
||||
&& FLOAT_TYPE_P (TREE_TYPE (def))
|
||||
&& TREE_CODE (def) == SSA_NAME)
|
||||
execute_cse_reciprocals_1 (&gsi, def);
|
||||
{
|
||||
if (flag_unsafe_math_optimizations
|
||||
&& is_gimple_assign (stmt)
|
||||
&& !stmt_can_throw_internal (stmt)
|
||||
&& gimple_assign_rhs_code (stmt) == RDIV_EXPR)
|
||||
optimize_recip_sqrt (&gsi, def);
|
||||
else
|
||||
execute_cse_reciprocals_1 (&gsi, def);
|
||||
}
|
||||
}
|
||||
|
||||
if (optimize_bb_for_size_p (bb))
|
||||
|
|
|
|||
Loading…
Reference in New Issue