Compare commits

...

2 Commits

Author SHA1 Message Date
David Faust 239535e9b0 bpf: fix memset miscompilation with larger stores [PR122139]
The BPF backend expansion of setmem was broken, because it could elect
to use stores of HI, SI or DI modes based on the destination alignment
when the value was QI, but fail to duplicate the byte value across to
those larger sizes.  This resulted in not all bytes of the destination
actually being set to the desired value.

Fix bpf_expand_setmem to ensure the desired byte value is really
duplicated as necessary, whether it is constant or a (sub)reg:QI.

	PR target/122139

gcc/

	* config/bpf/bpf.cc (bpf_expand_setmem): Duplicate byte value
	across to new mode when using larger modes for store.

gcc/testsuite/

	* gcc.target/bpf/memset-3.c: New.
	* gcc.target/bpf/memset-4.c: New.
2025-10-17 08:40:46 -07:00
Tamar Christina d1965b1fd8 AArch64: Extend intrinsics framework to account for merging predications without gp [PR121604]
In PR121604 the problem was noted that currently the SVE intrinsics
infrastructure assumes that for any predicated operation the GP is at the
first argument position which has an svbool_t, or for a unary merging
operation that it's in the second position.

However there are intrinsics like pmov_lane which have an svbool_t argument
that is not a GP but instead the inactive lanes.

You also have instructions like BRKB which work only on predicates so it
incorrectly determines the first operand to be the GP, while that's also the
inactive lanes.

However during apply_predication we do have the information about where the GP
is.  This patch re-organizes the code to record this information into the
function_instance such that folders have access to this information.

For functions that are outliers like pmov_lane we can now override whether
the intrinsic is treated as having a GP.

gcc/ChangeLog:

	PR target/121604
	* config/aarch64/aarch64-sve-builtins-shapes.cc (apply_predication):
	Store gp_index.
	(struct pmov_to_vector_lane_def): Mark instruction as has no GP.
	* config/aarch64/aarch64-sve-builtins.h (function_instance::gp_value,
	function_instance::inactive_values, function_instance::gp_index,
	function_shape::has_gp_argument_p): New.
	* config/aarch64/aarch64-sve-builtins.cc (gimple_folder::fold_pfalse):
	Simplify code and use GP helpers.

gcc/testsuite/ChangeLog:

	PR target/121604
	* gcc.target/aarch64/sve/pr121604_brk.c: New test.
	* gcc.target/aarch64/sve2/pr121604_pmov.c: New test.

Co-authored-by: Jennifer Schmitz <jschmitz@nvidia.com>
2025-10-17 15:43:10 +01:00
8 changed files with 258 additions and 18 deletions

View File

@ -21,7 +21,10 @@
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "basic-block.h"
#include "tree.h"
#include "function.h"
#include "gimple.h"
#include "rtl.h"
#include "tm_p.h"
#include "memmodel.h"
@ -68,25 +71,38 @@ za_group_is_pure_overload (const function_group_info &group)
types in ARGUMENT_TYPES. RETURN_TYPE is the type returned by the
function. */
static void
apply_predication (const function_instance &instance, tree return_type,
apply_predication (function_instance &instance, tree return_type,
vec<tree> &argument_types)
{
/* Initially mark the function as not being predicated. */
instance.gp_index = -1;
/* There are currently no SME ZA instructions that have both merging and
unpredicated forms, so for simplicity, the predicates are always included
in the original format string. */
if (instance.pred != PRED_none && instance.pred != PRED_za_m)
{
argument_types.quick_insert (0, instance.gp_type ());
instance.gp_index = 0;
/* For unary merge operations, the first argument is a vector with
the same type as the result. For unary_convert_narrowt it also
provides the "bottom" half of active elements, and is present
for all types of predication. */
auto nargs = argument_types.length () - 1;
if (instance.shape->has_merge_argument_p (instance, nargs))
{
argument_types.quick_insert (0, return_type);
instance.gp_index = 1;
}
}
/* In this case the predicate type we added above is a non-governing
predicate operand (and there is no GP), so update the gp_index value
accordingly. */
if (!instance.shape->has_gp_argument_p (instance))
instance.gp_index = -1;
}
/* Parse and move past an element type in FORMAT and return it as a type
suffix. The format is:
@ -3332,6 +3348,14 @@ struct pmov_to_vector_lane_def : public overloaded_base<0>
but it doesn't currently have the necessary information. */
return c.require_immediate_range (1, 1, bytes - 1);
}
/* This function has a predicate argument, and is a merging instruction, but
the predicate supplies the inactive lanes rather than acting as a governing
predicate, so report that there is no GP. */
bool
has_gp_argument_p (const function_instance &) const override
{
return false;
}
};
SHAPE (pmov_to_vector_lane)

View File

@ -3632,24 +3632,22 @@ gimple_folder::redirect_pred_x ()
gimple *
gimple_folder::fold_pfalse ()
{
if (pred == PRED_none)
tree gp = gp_value (call);
/* If there isn't a GP then we can't do any folding as the instruction isn't
predicated. */
if (!gp)
return nullptr;
tree arg0 = gimple_call_arg (call, 0);
if (pred == PRED_m)
{
/* Unary function shapes with _m predication are folded to the
inactive vector (arg0), while other function shapes are folded
to op1 (arg1). */
tree arg1 = gimple_call_arg (call, 1);
if (is_pfalse (arg1))
return fold_call_to (arg0);
if (is_pfalse (arg0))
return fold_call_to (arg1);
tree val = inactive_values (call);
if (is_pfalse (gp))
return fold_call_to (val);
return nullptr;
}
if ((pred == PRED_x || pred == PRED_z) && is_pfalse (arg0))
if ((pred == PRED_x || pred == PRED_z) && is_pfalse (gp))
return fold_call_to (build_zero_cst (TREE_TYPE (lhs)));
if (pred == PRED_implicit && is_pfalse (arg0))
if (pred == PRED_implicit && is_pfalse (gp))
{
unsigned int flags = call_properties ();
/* Folding to lhs = {0, ...} is not appropriate for intrinsics with

View File

@ -403,6 +403,8 @@ public:
bool could_trap_p () const;
vector_type_index gp_type_index () const;
tree gp_value (gcall *) const;
tree inactive_values (gcall *) const;
tree gp_type () const;
unsigned int vectors_per_tuple () const;
@ -436,6 +438,7 @@ public:
group_suffix_index group_suffix_id;
predication_index pred;
fpm_mode_index fpm_mode;
int gp_index;
};
class registered_function;
@ -801,6 +804,8 @@ public:
virtual bool has_merge_argument_p (const function_instance &,
unsigned int) const;
virtual bool has_gp_argument_p (const function_instance &) const;
virtual bool explicit_type_suffix_p (unsigned int) const = 0;
/* True if the group suffix is present in overloaded names.
@ -949,6 +954,33 @@ function_instance::gp_type () const
return acle_vector_types[0][gp_type_index ()];
}
/* Return the tree value that should be used as the governing predicate of
this function. If none then return NULL_TREE. */
inline tree
function_instance::gp_value (gcall *call) const
{
/* gp_index is -1 when the function has no governing predicate.  */
if (gp_index < 0)
return NULL_TREE;
return gimple_call_arg (call, gp_index);
}
/* Return the tree value that should be used for the inactive lanes should this
function be a predicated function with a gp. Otherwise return NULL_TREE. */
inline tree
function_instance::inactive_values (gcall *call) const
{
/* No governing predicate means there is no notion of inactive lanes.  */
if (gp_index < 0)
return NULL_TREE;
/* Function is unary with m predicate: the merge (inactive) value was
inserted before the GP, at argument 0. */
if (gp_index == 1)
return gimple_call_arg (call, 0);
/* Else the inactive values are the next element. */
return gimple_call_arg (call, 1);
}
/* If the function operates on tuples of vectors, return the number
of vectors in the tuples, otherwise return 1. */
inline unsigned int
@ -1123,6 +1155,14 @@ function_shape::has_merge_argument_p (const function_instance &instance,
return nargs == 1 && instance.pred == PRED_m;
}
/* Return true if INSTANCE has a predicate argument that can be used as the
governing predicate. By default any predicated function is assumed to have
a GP; shapes whose predicate instead supplies the inactive lanes override
this to return false. */
inline bool
function_shape::has_gp_argument_p (const function_instance &instance) const
{
return instance.pred != PRED_none;
}
/* Return the mode of the result of a call. */
inline machine_mode
function_expander::result_mode () const

View File

@ -1427,25 +1427,82 @@ bpf_expand_setmem (rtx *operands)
unsigned inc = GET_MODE_SIZE (mode);
unsigned offset = 0;
/* If val is a constant, then build a new constant value duplicating
the byte across to the size of stores we might do.
e.g. if val is 0xab and we can store in 4-byte chunks, build
0xabababab and use that to do the memset.
If val is not a constant, then by constraint it is a QImode register
and we similarly duplicate the byte across. */
rtx src;
if (CONST_INT_P (val))
{
unsigned HOST_WIDE_INT tmp = UINTVAL (val) & 0xff;
/* Need src in the proper mode. */
switch (mode)
{
case DImode:
src = gen_rtx_CONST_INT (DImode, tmp * 0x0101010101010101);
break;
case SImode:
src = gen_rtx_CONST_INT (SImode, tmp * 0x01010101);
break;
case HImode:
src = gen_rtx_CONST_INT (HImode, tmp * 0x0101);
break;
default:
src = val;
break;
}
}
else
{
/* VAL is a subreg:QI (reg:DI N).
Copy that byte to fill the whole register. */
src = gen_reg_rtx (mode);
emit_move_insn (src, gen_rtx_ZERO_EXTEND (mode, val));
/* We can fill the whole register with copies of the byte by multiplying
by 0x010101...
For DImode this requires a tmp reg with lldw, but only if we will
actually do nonzero iterations of stxdw. */
if (mode < DImode || iters == 0)
emit_move_insn (src, gen_rtx_MULT (mode, src, GEN_INT (0x01010101)));
else
{
rtx tmp = gen_reg_rtx (mode);
emit_move_insn (tmp, GEN_INT (0x0101010101010101));
emit_move_insn (src, gen_rtx_MULT (mode, src, tmp));
}
}
for (unsigned int i = 0; i < iters; i++)
{
emit_move_insn (adjust_address (dst, mode, offset), val);
emit_move_insn (adjust_address (dst, mode, offset), src);
offset += inc;
}
if (remainder & 4)
{
emit_move_insn (adjust_address (dst, SImode, offset), val);
emit_move_insn (adjust_address (dst, SImode, offset),
REG_P (src)
? simplify_gen_subreg (SImode, src, mode, 0)
: src);
offset += 4;
remainder -= 4;
}
if (remainder & 2)
{
emit_move_insn (adjust_address (dst, HImode, offset), val);
emit_move_insn (adjust_address (dst, HImode, offset),
REG_P (src)
? simplify_gen_subreg (HImode, src, mode, 0)
: src);
offset += 2;
remainder -= 2;
}
if (remainder & 1)
emit_move_insn (adjust_address (dst, QImode, offset), val);
emit_move_insn (adjust_address (dst, QImode, offset),
REG_P (src)
? simplify_gen_subreg (QImode, src, mode, 0)
: src);
return true;
}

View File

@ -0,0 +1,25 @@
/* { dg-do compile } */
/* { dg-options "-O2" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <arm_sve.h>
/*
** foo:
** ptrue p0\.b, all
** brkb p0\.b, p0/z, p0\.b
** ret
*/
svbool_t foo () {
return svbrkb_b_m (svpfalse (), svptrue_b8 (), svptrue_b8 ());
}
/*
** bar:
** ptrue p0\.b, all
** brka p0\.b, p0/z, p0\.b
** ret
*/
svbool_t bar () {
return svbrka_b_m (svpfalse (), svptrue_b8 (), svptrue_b8 ());
}

View File

@ -0,0 +1,16 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=armv8.2-a+sve2p1" } */
/* { dg-final { check-function-bodies "**" "" "" } } */
#include <arm_sve.h>
/*
** f:
** pfalse p([0-7]+)\.b
** mov z0\.b, #-1
** pmov z0\[1\], p\1\.d
** ret
*/
svuint64_t f () {
return svpmov_lane_u64_m (svdup_u64 (~0UL), svpfalse (), 1);
}

View File

@ -0,0 +1,56 @@
/* Test that inline memset expansion properly duplicates the byte value
across the bytes to fill. PR target/122139. */
/* { dg-do compile } */
/* { dg-options "-O1 -masm=normal" } */
#define SIZE 63
unsigned char cdata[SIZE];
unsigned short sdata[SIZE / 2 + 1];
unsigned int idata[SIZE / 4 + 1];
unsigned long ldata[SIZE / 8 + 1];
void
a (void)
{
__builtin_memset (cdata, 0x54, SIZE);
}
/* 0x54=84 */
/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,84" 63 } } */
void
b (void)
{
__builtin_memset (sdata, 0x7a, SIZE);
}
/* 0x7a=122, 0x7a7a=31354 */
/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,31354" 31 } } */
/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,122" 1 } } */
void
c (void)
{
__builtin_memset (idata, 0x23, SIZE);
}
/* 0x23=35, 0x2323=8995, 0x23232323=589505315 */
/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,589505315" 15 } } */
/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,8995" 1 } } */
/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,35" 1 } } */
void
d (void)
{
__builtin_memset (ldata, 0xcb, SIZE);
}
/* 0xcbcbcbcb_cbcbcbcb = -3761688987579986997,
0xcbcbcbcb = -875836469
0xcbcb = -13365
0xcb = -53 */
/* { dg-final { scan-assembler-times "lddw\t%r.,-3761688987579986997"}} */
/* { dg-final { scan-assembler-times "stxdw" 7 } } */
/* { dg-final { scan-assembler-times "\[\t \]stw\[^\r\n\]+,-875836469" 1 } } */
/* { dg-final { scan-assembler-times "\[\t \]sth\[^\r\n\]+,-13365" 1 } } */
/* { dg-final { scan-assembler-times "\[\t \]stb\[^\r\n\]+,-53" 1 } } */

View File

@ -0,0 +1,24 @@
/* Test that inline memset expansion properly duplicates the byte value
across the bytes to fill for non-const value. PR target/122139. */
/* { dg-do compile } */
/* { dg-options "-O1 -masm=normal" } */
#define SIZE 63
unsigned char cdata[SIZE];
unsigned short sdata[SIZE / 2 + 1];
unsigned int idata[SIZE / 4 + 1];
unsigned long ldata[SIZE / 8 + 1];
void
c (unsigned char byte)
{
__builtin_memset (idata, byte, SIZE);
}
/* Hard to verify for non-const value. Look for the mul by 0x01010101
and the proper number of stores... */
/* { dg-final { scan-assembler "mul32\[\t \]%r.,16843009" } } */
/* { dg-final { scan-assembler-times "stxw" 15 } } */
/* { dg-final { scan-assembler-times "stxh" 1 } } */
/* { dg-final { scan-assembler-times "stxb" 1 } } */