timevar.def (TV_SCHED_FUSION): New time var.

* timevar.def (TV_SCHED_FUSION): New time var. * passes.def (pass_sched_fusion): New pass. * config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New. (extract_base_offset_in_addr, fusion_load_store): New. (arm_sched_fusion_priority): New. (arm_option_override): Disable scheduling fusion by default on non-armv7 processors or ldrd/strd isn't preferred. * sched-int.h (struct _haifa_insn_data): New field. (INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New. * sched-rgn.c (rest_of_handle_sched_fusion): New. (pass_data_sched_fusion, pass_sched_fusion): New. (make_pass_sched_fusion): New. * haifa-sched.c (sched_fusion): New. (insn_cost): Handle sched_fusion. (priority): Handle sched_fusion by calling target hook. (enum rfs_decision): New enum value. (rfs_str): New element for RFS_FUSION. (rank_for_schedule): Support sched_fusion. (schedule_insn, max_issue, prune_ready_list): Handle sched_fusion. (schedule_block, fix_tick_ready): Handle sched_fusion. * common.opt (flag_schedule_fusion): New. * tree-pass.h (make_pass_sched_fusion): New. * target.def (fusion_priority): New. * doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New. * doc/tm.texi: Regenerated. * doc/invoke.texi (-fschedule-fusion): New. testsuite: * gcc.target/arm/ldrd-strd-pair-1.c: New test. * gcc.target/arm/vfp-1.c: Improve scanning string. From-SVN: r217533
2014-11-14 02:32:38 +00:00 · 2014-11-14 02:32:38 +00:00 · b16abbcb85
parent 0fb3402f69
commit b16abbcb85
16 changed files with 497 additions and 10 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,32 @@
 2014-11-14  Bin Cheng  <bin.cheng@arm.com>
 	* timevar.def (TV_SCHED_FUSION): New time var.
 	* passes.def (pass_sched_fusion): New pass.
 	* config/arm/arm.c (TARGET_SCHED_FUSION_PRIORITY): New.
 	(extract_base_offset_in_addr, fusion_load_store): New.
 	(arm_sched_fusion_priority): New.
 	(arm_option_override): Disable scheduling fusion by default
 	on non-armv7 processors or ldrd/strd isn't preferred.
 	* sched-int.h (struct _haifa_insn_data): New field.
 	(INSN_FUSION_PRIORITY, FUSION_MAX_PRIORITY, sched_fusion): New.
 	* sched-rgn.c (rest_of_handle_sched_fusion): New.
 	(pass_data_sched_fusion, pass_sched_fusion): New.
 	(make_pass_sched_fusion): New.
 	* haifa-sched.c (sched_fusion): New.
 	(insn_cost): Handle sched_fusion.
 	(priority): Handle sched_fusion by calling target hook.
 	(enum rfs_decision): New enum value.
 	(rfs_str): New element for RFS_FUSION.
 	(rank_for_schedule): Support sched_fusion.
 	(schedule_insn, max_issue, prune_ready_list): Handle sched_fusion.
 	(schedule_block, fix_tick_ready): Handle sched_fusion.
 	* common.opt (flag_schedule_fusion): New.
 	* tree-pass.h (make_pass_sched_fusion): New.
 	* target.def (fusion_priority): New.
 	* doc/tm.texi.in (TARGET_SCHED_FUSION_PRIORITY): New.
 	* doc/tm.texi: Regenerated.
 	* doc/invoke.texi (-fschedule-fusion): New.
 2014-11-13  Rong Xu  <xur@google.com>
 	PR debug/63581
--- a/gcc/common.opt
+++ b/gcc/common.opt
@ -1848,6 +1848,10 @@ frename-registers
 Common Report Var(flag_rename_registers) Init(2) Optimization
 Perform a register renaming optimization pass
 fschedule-fusion
 Common Report Var(flag_schedule_fusion) Init(2) Optimization
 Perform a target dependent instruction fusion optimization pass
 freorder-blocks
 Common Report Var(flag_reorder_blocks) Optimization
 Reorder basic blocks to improve code placement
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@ -311,6 +311,8 @@ static unsigned arm_add_stmt_cost (void *data, int count,
 static void arm_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
 					 bool op0_preserve_value);
 static unsigned HOST_WIDE_INT arm_asan_shadow_offset (void);
 static void arm_sched_fusion_priority (rtx_insn *, int, int *, int*);
 /* Table of machine attributes.  */
 static const struct attribute_spec arm_attribute_table[] =
@ -708,6 +710,9 @@ static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
 #undef TARGET_SCHED_FUSION_PRIORITY
 #define TARGET_SCHED_FUSION_PRIORITY arm_sched_fusion_priority
 struct gcc_target targetm = TARGET_INITIALIZER;
 /* Obstack for minipool constant handling.  */
@ -3168,6 +3173,12 @@ arm_option_override (void)
  if (TARGET_THUMB2)
    inline_asm_unified = 1;
  /* Disable scheduling fusion by default if it's not armv7 processor
     or doesn't prefer ldrd/strd.  */
  if (flag_schedule_fusion == 2
      && (!arm_arch7 || !current_tune->prefer_ldrd_strd))
    flag_schedule_fusion = 0;
  /* Register global variables with the garbage collector.  */
  arm_add_gc_roots ();
 }
@ -32350,4 +32361,124 @@ arm_is_constant_pool_ref (rtx x)
 	  && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)));
 }
 /* Split the address of MEM into a base register and a constant offset.
    One outer CONST wrapper is looked through first.  Two address shapes
    are accepted: a bare register, for which OFFSET becomes const0_rtx,
    and (plus (reg) (const_int)).  On success the two parts are stored
    through BASE and OFFSET and true is returned.  For any other address
    NULL_RTX is stored through both pointers and false is returned.  */
 static bool
 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
 {
  rtx inner;
  gcc_assert (MEM_P (mem));
  inner = XEXP (mem, 0);
  /* Look through (const (...)) to the address it wraps.  */
  if (GET_CODE (inner) == CONST)
    inner = XEXP (inner, 0);
  switch (GET_CODE (inner))
    {
    case REG:
      /* [base] is treated as [base, #0].  */
      *base = inner;
      *offset = const0_rtx;
      return true;
    case PLUS:
      if (GET_CODE (XEXP (inner, 0)) == REG
          && CONST_INT_P (XEXP (inner, 1)))
        {
          *base = XEXP (inner, 0);
          *offset = XEXP (inner, 1);
          return true;
        }
      break;
    default:
      break;
    }
  /* Not a [base+offset] address: clear both outputs.  */
  *base = NULL_RTX;
  *offset = NULL_RTX;
  return false;
 }
 /* Decide whether INSN is a register load or store whose memory operand
    has a [base+offset] address.  INSN must be a single SET that either
    copies a REG into a MEM (a store) or a MEM into a REG (a load);
    anything else returns false without touching the outputs.  For a
    load or store IS_LOAD is set accordingly and the address is split
    into BASE and OFFSET.  Return true only if both parts were found.  */
 static bool
 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset, bool *is_load)
 {
  rtx pat, lhs, rhs, mem;
  bool load_p;
  gcc_assert (INSN_P (insn));
  pat = PATTERN (insn);
  if (GET_CODE (pat) != SET)
    return false;
  rhs = SET_SRC (pat);
  lhs = SET_DEST (pat);
  /* Pick out the memory operand and remember the direction.  */
  if (GET_CODE (rhs) == REG && GET_CODE (lhs) == MEM)
    {
      mem = lhs;
      load_p = false;
    }
  else if (GET_CODE (rhs) == MEM && GET_CODE (lhs) == REG)
    {
      mem = rhs;
      load_p = true;
    }
  else
    return false;
  *is_load = load_p;
  extract_base_offset_in_addr (mem, base, offset);
  return (*base != NULL_RTX && *offset != NULL_RTX);
 }
 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
    Only the loads and stores accepted by fusion_load_store take part in
    fusion.  Every other instruction gets MAX_PRI - 1 for both FUSION_PRI
    and PRI, which is the largest FUSION_PRI handed out by this hook.
    A load gets FUSION_PRI of MAX_PRI - 2 and a store MAX_PRI - 3.
    PRI starts from (MAX_PRI - 1) / 2, is lowered by the low 8 bits of the
    base register number shifted left by 20, and is then adjusted by the
    low 20 bits of the offset magnitude: subtracted for a non-negative
    offset, added for a negative one.  Other kinds of instruction fusion
    could be supported later by returning different priorities.  */
 static void
 arm_sched_fusion_priority (rtx_insn *insn, int max_pri,
 			   int *fusion_pri, int *pri)
 {
  const int top = max_pri - 1;
  int rank, disp;
  bool load_p;
  rtx base_reg, disp_rtx;
  gcc_assert (INSN_P (insn));
  if (!fusion_load_store (insn, &base_reg, &disp_rtx, &load_p))
    {
      /* Not a fusible load/store.  */
      *pri = top;
      *fusion_pri = top;
      return;
    }
  /* Load goes first.  */
  *fusion_pri = load_p ? top - 1 : top - 2;
  rank = top / 2;
  /* INSN with smaller base register goes first.  */
  rank -= ((REGNO (base_reg) & 0xff) << 20);
  /* INSN with smaller offset goes first.  */
  disp = (int)(INTVAL (disp_rtx));
  if (disp >= 0)
    rank -= (disp & 0xfffff);
  else
    rank += ((- disp) & 0xfffff);
  *pri = rank;
 }
 #include "gt-arm.h"
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@ -406,7 +406,7 @@ Objective-C and Objective-C++ Dialects}.
 -fprofile-correction -fprofile-dir=@var{path} -fprofile-generate @gol
 -fprofile-generate=@var{path} @gol
 -fprofile-use -fprofile-use=@var{path} -fprofile-values -fprofile-reorder-functions @gol
-freciprocal-math -free -frename-registers -freorder-blocks @gol
+-freciprocal-math -free -frename-registers -fschedule-fusion -freorder-blocks @gol
 -freorder-blocks-and-partition -freorder-functions @gol
 -frerun-cse-after-loop -freschedule-modulo-scheduled-loops @gol
 -frounding-math -fsched2-use-superblocks -fsched-pressure @gol
@ -9575,6 +9575,14 @@ a ``home register''.
 Enabled by default with @option{-funroll-loops} and @option{-fpeel-loops}.
@item -fschedule-fusion
@opindex fschedule-fusion
 Performs a target dependent pass over the instruction stream to schedule
 instructions of same type together because target machine can execute them
 more efficiently if they are adjacent to each other in the instruction flow.
 Enabled at levels @option{-O2}, @option{-O3}, @option{-Os}.
@item -ftracer
@opindex ftracer
 Perform tail duplication to enlarge superblock size.  This transformation
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@ -6771,6 +6771,76 @@ This hook is called by tree reassociator to determine a level of
 parallelism required in output calculations chain.
@end deftypefn
@deftypefn {Target Hook} void TARGET_SCHED_FUSION_PRIORITY (rtx_insn *@var{insn}, int @var{max_pri}, int *@var{fusion_pri}, int *@var{pri})
 This hook is called by scheduling fusion pass.  It calculates fusion
 priorities for each instruction passed in by parameter.  The priorities
 are returned via pointer parameters.
@var{insn} is the instruction whose priorities need to be calculated.
@var{max_pri} is the maximum priority that can be returned in any case.
@var{fusion_pri} is the pointer parameter through which @var{insn}'s
 fusion priority should be calculated and returned.
@var{pri} is the pointer parameter through which @var{insn}'s priority
 should be calculated and returned.
 Same @var{fusion_pri} should be returned for instructions which should
 be scheduled together.  Different @var{pri} should be returned for
 instructions with same @var{fusion_pri}.  @var{fusion_pri} is the major
 sort key, @var{pri} is the minor sort key.  All instructions will be
 scheduled according to the two priorities.  All priorities calculated
 should be between 0 (exclusive) and @var{max_pri} (inclusive).  To avoid
 false dependencies, @var{fusion_pri} of instructions which need to be
 scheduled together should be smaller than @var{fusion_pri} of irrelevant
 instructions.
 Given below example:
    ldr r10, [r1, 4]
    add r4, r4, r10
    ldr r15, [r2, 8]
    sub r5, r5, r15
    ldr r11, [r1, 0]
    add r4, r4, r11
    ldr r16, [r2, 12]
    sub r5, r5, r16
 On targets like ARM/AArch64, the two pairs of consecutive loads should be
 merged.  However, peephole2 can't help in this case unless consecutive
 loads are actually next to each other in instruction flow.  That's where
 this scheduling fusion pass works.  This hook calculates priority for each
 instruction based on its fusion type, like:
    ldr r10, [r1, 4]  ; fusion_pri=99,  pri=96   
    add r4, r4, r10   ; fusion_pri=100, pri=100  
    ldr r15, [r2, 8]  ; fusion_pri=98,  pri=92   
    sub r5, r5, r15   ; fusion_pri=100, pri=100  
    ldr r11, [r1, 0]  ; fusion_pri=99,  pri=100  
    add r4, r4, r11   ; fusion_pri=100, pri=100  
    ldr r16, [r2, 12] ; fusion_pri=98,  pri=88   
    sub r5, r5, r16   ; fusion_pri=100, pri=100  
 Scheduling fusion pass then sorts all ready to issue instructions according
 to the priorities.  As a result, instructions of same fusion type will be
 pushed together in instruction flow, like:
    ldr r11, [r1, 0]
    ldr r10, [r1, 4]
    ldr r15, [r2, 8]
    ldr r16, [r2, 12]
    add r4, r4, r10
    sub r5, r5, r15
    add r4, r4, r11
    sub r5, r5, r16
 Now peephole2 pass can simply merge the two pairs of loads.
 Since scheduling fusion pass relies on peephole2 to do real fusion
 work, it is only enabled by default when peephole2 is in effect.
 This is firstly introduced on ARM/AArch64 targets, please refer to
 the hook implementation for how different fusion types are supported.
@end deftypefn
@node Sections
@section Dividing the Output into Sections (Texts, Data, @dots{})
@c the above section title is WAY too long.  maybe cut the part between
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@ -4811,6 +4811,8 @@ them: try the first ones in this list first.
@hook TARGET_SCHED_REASSOCIATION_WIDTH
@hook TARGET_SCHED_FUSION_PRIORITY
@node Sections
@section Dividing the Output into Sections (Texts, Data, @dots{})
@c the above section title is WAY too long.  maybe cut the part between
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
@ -1391,6 +1391,9 @@ insn_cost (rtx_insn *insn)
 {
  int cost;
  if (sched_fusion)
    return 0;
  if (sel_sched_p ())
    {
      if (recog_memoized (insn) < 0)
@ -1603,6 +1606,8 @@ dep_list_size (rtx insn, sd_list_types_def list)
  return nodbgcount;
 }
 bool sched_fusion;
 /* Compute the priority number for INSN.  */
 static int
 priority (rtx_insn *insn)
@ -1617,7 +1622,15 @@ priority (rtx_insn *insn)
    {
      int this_priority = -1;
-      if (dep_list_size (insn, SD_LIST_FORW) == 0)
+      if (sched_fusion)
 	{
 	  int this_fusion_priority;
 	  targetm.sched.fusion_priority (insn, FUSION_MAX_PRIORITY,
 					 &this_fusion_priority, &this_priority);
 	  INSN_FUSION_PRIORITY (insn) = this_fusion_priority;
 	}
      else if (dep_list_size (insn, SD_LIST_FORW) == 0)
 	/* ??? We should set INSN_PRIORITY to insn_cost when and insn has
 	   some forward deps but all of them are ignored by
 	   contributes_to_priority hook.  At the moment we set priority of
@ -2548,7 +2561,7 @@ enum rfs_decision {
  RFS_SCHED_GROUP, RFS_PRESSURE_DELAY, RFS_PRESSURE_TICK,
  RFS_FEEDS_BACKTRACK_INSN, RFS_PRIORITY, RFS_SPECULATION,
  RFS_SCHED_RANK, RFS_LAST_INSN, RFS_PRESSURE_INDEX,
-  RFS_DEP_COUNT, RFS_TIE, RFS_N };
+  RFS_DEP_COUNT, RFS_TIE, RFS_FUSION, RFS_N };
 /* Corresponding strings for print outs.  */
 static const char *rfs_str[RFS_N] = {
@ -2556,7 +2569,7 @@ static const char *rfs_str[RFS_N] = {
  "RFS_SCHED_GROUP", "RFS_PRESSURE_DELAY", "RFS_PRESSURE_TICK",
  "RFS_FEEDS_BACKTRACK_INSN", "RFS_PRIORITY", "RFS_SPECULATION",
  "RFS_SCHED_RANK", "RFS_LAST_INSN", "RFS_PRESSURE_INDEX",
-  "RFS_DEP_COUNT", "RFS_TIE" };
+  "RFS_DEP_COUNT", "RFS_TIE", "RFS_FUSION" };
 /* Statistical breakdown of rank_for_schedule decisions.  */
 typedef struct { unsigned stats[RFS_N]; } rank_for_schedule_stats_t;
@ -2627,6 +2640,55 @@ rank_for_schedule (const void *x, const void *y)
  /* Make sure that priority of TMP and TMP2 are initialized.  */
  gcc_assert (INSN_PRIORITY_KNOWN (tmp) && INSN_PRIORITY_KNOWN (tmp2));
  if (sched_fusion)
    {
      /* The instruction that has the same fusion priority as the last
 	 instruction is the instruction we picked next.  If that is not
 	 the case, we sort ready list firstly by fusion priority, then
 	 by priority, and at last by INSN_LUID.  */
      int a = INSN_FUSION_PRIORITY (tmp);
      int b = INSN_FUSION_PRIORITY (tmp2);
      int last = -1;
      if (last_nondebug_scheduled_insn
 	  && !NOTE_P (last_nondebug_scheduled_insn)
 	  && BLOCK_FOR_INSN (tmp)
 	       == BLOCK_FOR_INSN (last_nondebug_scheduled_insn))
 	last = INSN_FUSION_PRIORITY (last_nondebug_scheduled_insn);
      if (a != last && b != last)
 	{
 	  if (a == b)
 	    {
 	      a = INSN_PRIORITY (tmp);
 	      b = INSN_PRIORITY (tmp2);
 	    }
 	  if (a != b)
 	    return rfs_result (RFS_FUSION, b - a, tmp, tmp2);
 	  else
 	    return rfs_result (RFS_FUSION,
 			       INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
 	}
      else if (a == b)
 	{
 	  gcc_assert (last_nondebug_scheduled_insn
 		      && !NOTE_P (last_nondebug_scheduled_insn));
 	  last = INSN_PRIORITY (last_nondebug_scheduled_insn);
 	  a = abs (INSN_PRIORITY (tmp) - last);
 	  b = abs (INSN_PRIORITY (tmp2) - last);
 	  if (a != b)
 	    return rfs_result (RFS_FUSION, a - b, tmp, tmp2);
 	  else
 	    return rfs_result (RFS_FUSION,
 			       INSN_LUID (tmp) - INSN_LUID (tmp2), tmp, tmp2);
 	}
      else if (a == last)
 	return rfs_result (RFS_FUSION, -1, tmp, tmp2);
      else
 	return rfs_result (RFS_FUSION, 1, tmp, tmp2);
    }
  if (sched_pressure != SCHED_PRESSURE_NONE)
    {
      /* Prefer insn whose scheduling results in the smallest register
@ -4007,8 +4069,8 @@ schedule_insn (rtx_insn *insn)
  gcc_assert (INSN_TICK (insn) >= MIN_TICK);
  if (INSN_TICK (insn) > clock_var)
    /* INSN has been prematurely moved from the queue to the ready list.
-       This is possible only if following flag is set.  */
+       This is possible only if following flags are set.  */
-    gcc_assert (flag_sched_stalled_insns);
+    gcc_assert (flag_sched_stalled_insns || sched_fusion);
  /* ??? Probably, if INSN is scheduled prematurely, we should leave
     INSN_TICK untouched.  This is a machine-dependent issue, actually.  */
@ -5500,6 +5562,9 @@ max_issue (struct ready_list *ready, int privileged_n, state_t state,
  struct choice_entry *top;
  rtx_insn *insn;
  if (sched_fusion)
    return 0;
  n_ready = ready->n_ready;
  gcc_assert (dfa_lookahead >= 1 && privileged_n >= 0
 	      && privileged_n <= n_ready);
@ -5848,6 +5913,9 @@ prune_ready_list (state_t temp_state, bool first_cycle_insn_p,
  bool sched_group_found = false;
  int min_cost_group = 1;
  if (sched_fusion)
    return;
  for (i = 0; i < ready.n_ready; i++)
    {
      rtx_insn *insn = ready_element (&ready, i);
@ -6059,7 +6127,7 @@ schedule_block (basic_block *target_bb, state_t init_state)
  rtx_insn *tail = PREV_INSN (next_tail);
  if ((current_sched_info->flags & DONT_BREAK_DEPENDENCIES) == 0
-      && sched_pressure != SCHED_PRESSURE_MODEL)
+      && sched_pressure != SCHED_PRESSURE_MODEL && !sched_fusion)
    find_modifiable_mems (head, tail);
  /* We used to have code to avoid getting parameters moved from hard
@ -6455,7 +6523,7 @@ schedule_block (basic_block *target_bb, state_t init_state)
 	    {
 	      memcpy (temp_state, curr_state, dfa_state_size);
 	      cost = state_transition (curr_state, insn);
-	      if (sched_pressure != SCHED_PRESSURE_WEIGHTED)
+	      if (sched_pressure != SCHED_PRESSURE_WEIGHTED && !sched_fusion)
 		gcc_assert (cost < 0);
 	      if (memcmp (temp_state, curr_state, dfa_state_size) != 0)
 		cycle_issued_insns++;
@ -7288,7 +7356,7 @@ fix_tick_ready (rtx_insn *next)
  INSN_TICK (next) = tick;
  delay = tick - clock_var;
-  if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE)
+  if (delay <= 0 || sched_pressure != SCHED_PRESSURE_NONE || sched_fusion)
    delay = QUEUE_READY;
  change_queue_index (next, delay);
--- a/gcc/passes.def
+++ b/gcc/passes.def
@ -419,6 +419,7 @@ along with GCC; see the file COPYING3.  If not see
 	  NEXT_PASS (pass_stack_adjustments);
 	  NEXT_PASS (pass_jump2);
 	  NEXT_PASS (pass_duplicate_computed_gotos);
 	  NEXT_PASS (pass_sched_fusion);
 	  NEXT_PASS (pass_peephole2);
 	  NEXT_PASS (pass_if_after_reload);
 	  NEXT_PASS (pass_regrename);
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@ -805,6 +805,9 @@ struct _haifa_insn_data
  /* A priority for each insn.  */
  int priority;
  /* The fusion priority for each insn.  */
  int fusion_priority;
  /* The minimum clock tick at which the insn becomes ready.  This is
     used to note timing constraints for the insns in the pending list.  */
  int tick;
@ -903,6 +906,7 @@ extern vec<haifa_insn_data_def> h_i_d;
 /* Accessor macros for h_i_d.  There are more in haifa-sched.c and
   sched-rgn.c.  */
 #define INSN_PRIORITY(INSN) (HID (INSN)->priority)
 #define INSN_FUSION_PRIORITY(INSN) (HID (INSN)->fusion_priority)
 #define INSN_REG_PRESSURE(INSN) (HID (INSN)->reg_pressure)
 #define INSN_MAX_REG_PRESSURE(INSN) (HID (INSN)->max_reg_pressure)
 #define INSN_REG_USE_LIST(INSN) (HID (INSN)->reg_use_list)
@ -1620,6 +1624,10 @@ extern void sd_copy_back_deps (rtx_insn *, rtx_insn *, bool);
 extern void sd_delete_dep (sd_iterator_def);
 extern void sd_debug_lists (rtx, sd_list_types_def);
 /* Macros and declarations for scheduling fusion.  */
 #define FUSION_MAX_PRIORITY (INT_MAX)
 extern bool sched_fusion;
 #endif /* INSN_SCHEDULING */
 #endif /* GCC_SCHED_INT_H */
--- a/gcc/sched-rgn.c
+++ b/gcc/sched-rgn.c
@ -3658,6 +3658,17 @@ rest_of_handle_sched2 (void)
  return 0;
 }
 /* Body of the scheduling fusion pass.  When INSN_SCHEDULING is defined,
    set the global sched_fusion flag, call schedule_insns, and clear the
    flag again.  Without INSN_SCHEDULING this does nothing.  Always
    returns 0.  */
 static unsigned int
 rest_of_handle_sched_fusion (void)
 {
 #ifdef INSN_SCHEDULING
  /* The flag must be set only for the duration of this one call.  */
  sched_fusion = true;
  schedule_insns ();
  sched_fusion = false;
 #endif
  return 0;
 }
 namespace {
 const pass_data pass_data_live_range_shrinkage =
@ -3800,3 +3811,55 @@ make_pass_sched2 (gcc::context *ctxt)
 {
  return new pass_sched2 (ctxt);
 }
 namespace {
 /* Static description of the scheduling fusion pass.  */
 const pass_data pass_data_sched_fusion =
 {
  RTL_PASS, /* type */
  "sched_fusion", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_SCHED_FUSION, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
 };
 /* RTL pass object for scheduling fusion.  execute forwards to
    rest_of_handle_sched_fusion; gate is defined out of line below.  */
 class pass_sched_fusion : public rtl_opt_pass
 {
 public:
  pass_sched_fusion (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_sched_fusion, ctxt)
  {}
  /* opt_pass methods: */
  virtual bool gate (function *);
  virtual unsigned int execute (function *)
    {
      return rest_of_handle_sched_fusion ();
    }
 }; // class pass_sched_fusion
 /* Return true if the pass should run.  With INSN_SCHEDULING defined this
    requires optimize > 0, flag_peephole2 and flag_schedule_fusion to be
    nonzero, and the target to provide the fusion_priority hook.  Without
    INSN_SCHEDULING the pass never runs.  */
 bool
 pass_sched_fusion::gate (function *)
 {
 #ifdef INSN_SCHEDULING
  /* Scheduling fusion relies on peephole2 to do real fusion work,
     so only enable it if peephole2 is in effect.  */
  return (optimize > 0 && flag_peephole2
    && flag_schedule_fusion && targetm.sched.fusion_priority != NULL);
 #else
  return 0;
 #endif
 }
 } // anon namespace
 /* Create a new instance of the scheduling fusion pass for CTXT.  */
 rtl_opt_pass *
 make_pass_sched_fusion (gcc::context *ctxt)
 {
  return new pass_sched_fusion (ctxt);
 }
--- a/gcc/target.def
+++ b/gcc/target.def
@ -1526,6 +1526,79 @@ parallelism required in output calculations chain.",
 int, (unsigned int opc, machine_mode mode),
 hook_int_uint_mode_1)
 /* The following member value is a function that returns priority for
   fusion of each instruction via pointer parameters.  */
 DEFHOOK
 (fusion_priority,
 "This hook is called by scheduling fusion pass.  It calculates fusion\n\
 priorities for each instruction passed in by parameter.  The priorities\n\
 are returned via pointer parameters.\n\
 \n\
@var{insn} is the instruction whose priorities need to be calculated.\n\
@var{max_pri} is the maximum priority that can be returned in any case.\n\
@var{fusion_pri} is the pointer parameter through which @var{insn}'s\n\
 fusion priority should be calculated and returned.\n\
@var{pri} is the pointer parameter through which @var{insn}'s priority\n\
 should be calculated and returned.\n\
 \n\
 Same @var{fusion_pri} should be returned for instructions which should\n\
 be scheduled together.  Different @var{pri} should be returned for\n\
 instructions with same @var{fusion_pri}.  @var{fusion_pri} is the major\n\
 sort key, @var{pri} is the minor sort key.  All instructions will be\n\
 scheduled according to the two priorities.  All priorities calculated\n\
 should be between 0 (exclusive) and @var{max_pri} (inclusive).  To avoid\n\
 false dependencies, @var{fusion_pri} of instructions which need to be\n\
 scheduled together should be smaller than @var{fusion_pri} of irrelevant\n\
 instructions.\n\
 \n\
 Given below example:\n\
 \n\
    ldr r10, [r1, 4]\n\
    add r4, r4, r10\n\
    ldr r15, [r2, 8]\n\
    sub r5, r5, r15\n\
    ldr r11, [r1, 0]\n\
    add r4, r4, r11\n\
    ldr r16, [r2, 12]\n\
    sub r5, r5, r16\n\
 \n\
 On targets like ARM/AArch64, the two pairs of consecutive loads should be\n\
 merged.  However, peephole2 can't help in this case unless consecutive\n\
 loads are actually next to each other in instruction flow.  That's where\n\
 this scheduling fusion pass works.  This hook calculates priority for each\n\
 instruction based on its fusion type, like:\n\
 \n\
    ldr r10, [r1, 4]  ; fusion_pri=99,  pri=96   \n\
    add r4, r4, r10   ; fusion_pri=100, pri=100  \n\
    ldr r15, [r2, 8]  ; fusion_pri=98,  pri=92   \n\
    sub r5, r5, r15   ; fusion_pri=100, pri=100  \n\
    ldr r11, [r1, 0]  ; fusion_pri=99,  pri=100  \n\
    add r4, r4, r11   ; fusion_pri=100, pri=100  \n\
    ldr r16, [r2, 12] ; fusion_pri=98,  pri=88   \n\
    sub r5, r5, r16   ; fusion_pri=100, pri=100  \n\
 \n\
 Scheduling fusion pass then sorts all ready to issue instructions according\n\
 to the priorities.  As a result, instructions of same fusion type will be\n\
 pushed together in instruction flow, like:\n\
 \n\
    ldr r11, [r1, 0]\n\
    ldr r10, [r1, 4]\n\
    ldr r15, [r2, 8]\n\
    ldr r16, [r2, 12]\n\
    add r4, r4, r10\n\
    sub r5, r5, r15\n\
    add r4, r4, r11\n\
    sub r5, r5, r16\n\
 \n\
 Now peephole2 pass can simply merge the two pairs of loads.\n\
 \n\
 Since scheduling fusion pass relies on peephole2 to do real fusion\n\
 work, it is only enabled by default when peephole2 is in effect.\n\
 \n\
 This is firstly introduced on ARM/AArch64 targets, please refer to\n\
 the hook implementation for how different fusion types are supported.",
 void, (rtx_insn *insn, int max_pri, int *fusion_pri, int *pri), NULL)
 HOOK_VECTOR_END (sched)
 /* Functions relating to OpenMP and Cilk Plus SIMD clones.  */
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@ -1,3 +1,8 @@
 2014-11-14  Bin Cheng  <bin.cheng@arm.com>
 	* gcc.target/arm/ldrd-strd-pair-1.c: New test.
 	* gcc.target/arm/vfp-1.c: Improve scanning string.
 2014-11-13  Rong Xu  <xur@google.com>
 	PR debug/63581
--- a/gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c
+++ b/gcc/testsuite/gcc.target/arm/ldrd-strd-pair-1.c
@ -0,0 +1,23 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_prefer_ldrd_strd } */
 /* { dg-options "-O2 -mthumb" } */
 /* Global with two consecutive int members (x, y) followed by a char and
    another int.  The code shape below is deliberate: the dg-final
    directive at the end of this file scans the generated assembly for an
    strd, so do not reorder or simplify the statements.  */
 struct
 {
  int x;
  int y;
  char c;
  int d;
 }a;
 /* Store X to a.x and Y to a.y, with a store to a.d (of the value just
    read back from a.x) placed between them in program order.
    NOTE(review): presumably the point is that the a.x and a.y stores
    still end up paired into one strd -- confirm against the pass.  */
 int foo(int x, int y)
 {
  int c;
  a.x = x;
  c = a.x;
  a.d = c;
  a.y = y;
  return 0;
 }
 /* { dg-final { scan-assembler "strd\t" { target { arm_thumb2_ok } } } } */
--- a/gcc/testsuite/gcc.target/arm/vfp-1.c
+++ b/gcc/testsuite/gcc.target/arm/vfp-1.c
@ -126,7 +126,7 @@ void test_convert () {
 }
 void test_ldst (float f[], double d[]) {
-  /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #1020\\\]" } } */
+  /* { dg-final { scan-assembler "vldr.32.+ \\\[r0, #-?\[0-9\]+\\\]" } } */
  /* { dg-final { scan-assembler "vldr.32.+ \\\[r\[0-9\], #-1020\\\]" { target { arm32 && { ! arm_thumb2_ok } } } } } */
  /* { dg-final { scan-assembler "add.+ r0, #1024" } } */
  /* { dg-final { scan-assembler "vstr.32.+ \\\[r\[0-9\]\\\]\n" } } */
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@ -247,6 +247,7 @@ DEFTIMEVAR (TV_IFCVT2		     , "if-conversion 2")
 DEFTIMEVAR (TV_COMBINE_STACK_ADJUST  , "combine stack adjustments")
 DEFTIMEVAR (TV_PEEPHOLE2             , "peephole 2")
 DEFTIMEVAR (TV_RENAME_REGISTERS      , "rename registers")
 DEFTIMEVAR (TV_SCHED_FUSION          , "scheduling fusion")
 DEFTIMEVAR (TV_CPROP_REGISTERS       , "hard reg cprop")
 DEFTIMEVAR (TV_SCHED2                , "scheduling 2")
 DEFTIMEVAR (TV_MACH_DEP              , "machine dep reorg")
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@ -552,6 +552,7 @@ extern rtl_opt_pass *make_pass_branch_target_load_optimize1 (gcc::context
 extern rtl_opt_pass *make_pass_thread_prologue_and_epilogue (gcc::context
 							     *ctxt);
 extern rtl_opt_pass *make_pass_stack_adjustments (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_sched_fusion (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_peephole2 (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_if_after_reload (gcc::context *ctxt);
 extern rtl_opt_pass *make_pass_regrename (gcc::context *ctxt);