Vectorize fast path of _cpp_clean_line.

* configure.ac (AC_C_BIGENDIAN, AC_TYPE_UINTPTR_T): New tests. (ssize_t): Check via AC_TYPE_SSIZE_T instead of AC_CHECK_TYPE. (ptrdiff_t): Check via AC_CHECK_TYPE. * config.in, configure: Rebuild. * system.h: Include stdint.h, if available. * lex.c (WORDS_BIGENDIAN): Provide default. (acc_char_mask_misalign, acc_char_replicate, acc_char_cmp, acc_char_index, search_line_acc_char, repl_chars, search_line_mmx, search_line_sse2, search_line_sse42, init_vectorized_lexer, search_line_fast): New. (_cpp_clean_line): Use search_line_fast. Restructure the fast loop to make it clear when we're leaving the loop. Stay in the fast loop for non-trigraph '?'. Co-Authored-By: Andi Kleen <ak@linux.intel.com> Co-Authored-By: David S. Miller <davem@davemloft.net> From-SVN: r163446
2010-08-21 12:05:40 -07:00 · 2010-08-21 12:05:40 -07:00 · 246a2fcb5e
parent 1d0134b3cc
commit 246a2fcb5e
6 changed files with 953 additions and 99 deletions
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@ -1,3 +1,21 @@
 2010-08-21  Richard Henderson  <rth@redhat.com>
 	    Andi Kleen <ak@linux.intel.com>
 	    David S. Miller  <davem@davemloft.net>
 	* configure.ac (AC_C_BIGENDIAN, AC_TYPE_UINTPTR_T): New tests.
 	(ssize_t): Check via AC_TYPE_SSIZE_T instead of AC_CHECK_TYPE.
 	(ptrdiff_t): Check via AC_CHECK_TYPE.
 	* config.in, configure: Rebuild.
 	* system.h: Include stdint.h, if available.
 	* lex.c (WORDS_BIGENDIAN): Provide default.
 	(acc_char_mask_misalign, acc_char_replicate, acc_char_cmp,
 	acc_char_index, search_line_acc_char, repl_chars, search_line_mmx,
 	search_line_sse2, search_line_sse42, init_vectorized_lexer,
 	search_line_fast): New.
 	(_cpp_clean_line): Use search_line_fast.  Restructure the fast
 	loop to make it clear when we're leaving the loop.  Stay in the
 	fast loop for non-trigraph '?'.
 2010-06-11  Jakub Jelinek  <jakub@redhat.com>
 	* include/cpplib.h (struct cpp_callbacks): Add user_builtin_macro
--- a/libcpp/config.in
+++ b/libcpp/config.in
@ -1,5 +1,8 @@
 /* config.in.  Generated from configure.ac by autoheader.  */
 /* Define if building universal (internal helper macro) */
 #undef AC_APPLE_UNIVERSAL_BUILD
 /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
   systems. This function is required for `alloca.c' support on those systems.
   */
@ -209,6 +212,9 @@
 /* Define if <sys/types.h> defines \`uchar'. */
 #undef HAVE_UCHAR
 /* Define to 1 if the system has the type `uintptr_t'. */
 #undef HAVE_UINTPTR_T
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H
@ -266,6 +272,18 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME
 /* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
   significant byte first (like Motorola and SPARC, unlike Intel). */
 #if defined AC_APPLE_UNIVERSAL_BUILD
 # if defined __BIG_ENDIAN__
 #  define WORDS_BIGENDIAN 1
 # endif
 #else
 # ifndef WORDS_BIGENDIAN
 #  undef WORDS_BIGENDIAN
 # endif
 #endif
 /* Define to empty if `const' does not conform to ANSI C. */
 #undef const
@ -278,8 +296,15 @@
 /* Define to `long int' if <sys/types.h> does not define. */
 #undef off_t
 /* Define to `int' if <sys/types.h> does not define. */
 #undef ptrdiff_t
 /* Define to `unsigned int' if <sys/types.h> does not define. */
 #undef size_t
 /* Define to `int' if <sys/types.h> does not define. */
 #undef ssize_t
 /* Define to the type of an unsigned integer type wide enough to hold a
   pointer, if such a type exists, and if the system does not define it. */
 #undef uintptr_t
--- a/libcpp/configure
+++ b/libcpp/configure
@ -1846,6 +1846,48 @@ fi
 } # ac_fn_cxx_check_header_mongrel
 # ac_fn_cxx_try_run LINENO
 # ------------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
 # that executables *can* be run.
 ac_fn_cxx_try_run ()
 {
  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
  if { { ac_try="$ac_link"
 case "(($ac_try" in
  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
  *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
 $as_echo "$ac_try_echo"; } >&5
  (eval "$ac_link") 2>&5
  ac_status=$?
  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
  { { case "(($ac_try" in
  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
  *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
 $as_echo "$ac_try_echo"; } >&5
  (eval "$ac_try") 2>&5
  ac_status=$?
  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
  test $ac_status = 0; }; }; then :
  ac_retval=0
 else
  $as_echo "$as_me: program exited with status $ac_status" >&5
       $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
       ac_retval=$ac_status
 fi
  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
  return $ac_retval
 } # ac_fn_cxx_try_run
 # ac_fn_cxx_try_link LINENO
 # -------------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded.
@ -1946,48 +1988,6 @@ $as_echo "$ac_res" >&6; }
 } # ac_fn_cxx_check_type
 # ac_fn_cxx_try_run LINENO
 # ------------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
 # that executables *can* be run.
 ac_fn_cxx_try_run ()
 {
  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
  if { { ac_try="$ac_link"
 case "(($ac_try" in
  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
  *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
 $as_echo "$ac_try_echo"; } >&5
  (eval "$ac_link") 2>&5
  ac_status=$?
  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
  { { case "(($ac_try" in
  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
  *) ac_try_echo=$ac_try;;
 esac
 eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
 $as_echo "$ac_try_echo"; } >&5
  (eval "$ac_try") 2>&5
  ac_status=$?
  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
  test $ac_status = 0; }; }; then :
  ac_retval=0
 else
  $as_echo "$as_me: program exited with status $ac_status" >&5
       $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
       ac_retval=$ac_status
 fi
  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
  return $ac_retval
 } # ac_fn_cxx_try_run
 # ac_fn_cxx_compute_int LINENO EXPR VAR INCLUDES
 # ----------------------------------------------
 # Tries to find the compile-time value of EXPR in a program that includes
@ -5172,6 +5172,230 @@ done
 fi
 # Checks for typedefs, structures, and compiler characteristics.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
 $as_echo_n "checking whether byte ordering is bigendian... " >&6; }
 if test "${ac_cv_c_bigendian+set}" = set; then :
  $as_echo_n "(cached) " >&6
 else
  ac_cv_c_bigendian=unknown
    # See if we're dealing with a universal compiler.
    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #ifndef __APPLE_CC__
 	       not a universal capable compiler
 	     #endif
 	     typedef int dummy;
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
 	# Check for potential -arch flags.  It is not universal unless
 	# there are at least two -arch flags with different values.
 	ac_arch=
 	ac_prev=
 	for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do
 	 if test -n "$ac_prev"; then
 	   case $ac_word in
 	     i?86 | x86_64 | ppc | ppc64)
 	       if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then
 		 ac_arch=$ac_word
 	       else
 		 ac_cv_c_bigendian=universal
 		 break
 	       fi
 	       ;;
 	   esac
 	   ac_prev=
 	 elif test "x$ac_word" = "x-arch"; then
 	   ac_prev=arch
 	 fi
       done
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
    if test $ac_cv_c_bigendian = unknown; then
      # See if sys/param.h defines the BYTE_ORDER macro.
      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <sys/types.h>
 	     #include <sys/param.h>
 int
 main ()
 {
 #if ! (defined BYTE_ORDER && defined BIG_ENDIAN \
 		     && defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \
 		     && LITTLE_ENDIAN)
 	      bogus endian macros
 	     #endif
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  # It does; now see whether it defined to BIG_ENDIAN or not.
 	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <sys/types.h>
 		#include <sys/param.h>
 int
 main ()
 {
 #if BYTE_ORDER != BIG_ENDIAN
 		 not big endian
 		#endif
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  ac_cv_c_bigendian=yes
 else
  ac_cv_c_bigendian=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
    fi
    if test $ac_cv_c_bigendian = unknown; then
      # See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris).
      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <limits.h>
 int
 main ()
 {
 #if ! (defined _LITTLE_ENDIAN || defined _BIG_ENDIAN)
 	      bogus endian macros
 	     #endif
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  # It does; now see whether it defined to _BIG_ENDIAN or not.
 	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <limits.h>
 int
 main ()
 {
 #ifndef _BIG_ENDIAN
 		 not big endian
 		#endif
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  ac_cv_c_bigendian=yes
 else
  ac_cv_c_bigendian=no
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
    fi
    if test $ac_cv_c_bigendian = unknown; then
      # Compile a test program.
      if test "$cross_compiling" = yes; then :
  # Try to guess by grepping values from an object file.
 	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 short int ascii_mm[] =
 		  { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
 		short int ascii_ii[] =
 		  { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
 		int use_ascii (int i) {
 		  return ascii_mm[i] + ascii_ii[i];
 		}
 		short int ebcdic_ii[] =
 		  { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
 		short int ebcdic_mm[] =
 		  { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
 		int use_ebcdic (int i) {
 		  return ebcdic_mm[i] + ebcdic_ii[i];
 		}
 		extern int foo;
 int
 main ()
 {
 return use_ascii (foo) == use_ebcdic (foo);
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
  if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then
 	      ac_cv_c_bigendian=yes
 	    fi
 	    if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
 	      if test "$ac_cv_c_bigendian" = unknown; then
 		ac_cv_c_bigendian=no
 	      else
 		# finding both strings is unlikely to happen, but who knows?
 		ac_cv_c_bigendian=unknown
 	      fi
 	    fi
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 else
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 $ac_includes_default
 int
 main ()
 {
 	     /* Are we little or big endian?  From Harbison&Steele.  */
 	     union
 	     {
 	       long int l;
 	       char c[sizeof (long int)];
 	     } u;
 	     u.l = 1;
 	     return u.c[sizeof (long int) - 1] == 1;
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_run "$LINENO"; then :
  ac_cv_c_bigendian=no
 else
  ac_cv_c_bigendian=yes
 fi
 rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
  conftest.$ac_objext conftest.beam conftest.$ac_ext
 fi
    fi
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5
 $as_echo "$ac_cv_c_bigendian" >&6; }
 case $ac_cv_c_bigendian in #(
   yes)
     $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h
 ;; #(
   no)
      ;; #(
   universal)
 $as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
     ;; #(
   *)
     as_fn_error "unknown endianness
 presetting ac_cv_c_bigendian=no (or yes) will help" "$LINENO" 5 ;;
 esac
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5
 $as_echo_n "checking for an ANSI C-conforming const... " >&6; }
 if test "${ac_cv_c_const+set}" = set; then :
@ -5369,6 +5593,53 @@ cat >>confdefs.h <<_ACEOF
 #define ssize_t int
 _ACEOF
 fi
  ac_fn_cxx_check_type "$LINENO" "uintptr_t" "ac_cv_type_uintptr_t" "$ac_includes_default"
 if test "x$ac_cv_type_uintptr_t" = x""yes; then :
 $as_echo "#define HAVE_UINTPTR_T 1" >>confdefs.h
 else
  for ac_type in 'unsigned int' 'unsigned long int' \
 	'unsigned long long int'; do
       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 $ac_includes_default
 int
 main ()
 {
 static int test_array [1 - 2 * !(sizeof (void *) <= sizeof ($ac_type))];
 test_array [0] = 0
  ;
  return 0;
 }
 _ACEOF
 if ac_fn_cxx_try_compile "$LINENO"; then :
 cat >>confdefs.h <<_ACEOF
 #define uintptr_t $ac_type
 _ACEOF
 	  ac_type=
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
       test -z "$ac_type" && break
     done
 fi
 ac_fn_cxx_check_type "$LINENO" "ptrdiff_t" "ac_cv_type_ptrdiff_t" "$ac_includes_default"
 if test "x$ac_cv_type_ptrdiff_t" = x""yes; then :
 else
 cat >>confdefs.h <<_ACEOF
 #define ptrdiff_t int
 _ACEOF
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether struct tm is in sys/time.h or time.h" >&5
@ -7042,6 +7313,7 @@ LTLIBOBJS=$ac_ltlibobjs
 : ${CONFIG_STATUS=./config.status}
 ac_write_fail=0
 ac_clean_files_save=$ac_clean_files
--- a/libcpp/configure.ac
+++ b/libcpp/configure.ac
@ -70,12 +70,15 @@ else
 fi
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_BIGENDIAN
 AC_C_CONST
 AC_C_INLINE
 AC_FUNC_OBSTACK
 AC_TYPE_OFF_T
 AC_TYPE_SIZE_T
-AC_CHECK_TYPE(ssize_t, int)
+AC_TYPE_SSIZE_T
 AC_TYPE_UINTPTR_T
 AC_CHECK_TYPE(ptrdiff_t, int)
 AC_STRUCT_TM
 AC_CHECK_SIZEOF(int)
 AC_CHECK_SIZEOF(long)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@ -1,5 +1,5 @@
 /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
@ -96,6 +96,531 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  buffer->notes_used++;
 }
 /* Fast path to find line special characters using optimized character
   scanning algorithms.  Anything complicated falls back to the slow
   path below.  Since this loop is very hot it's worth doing these kinds
   of optimizations.
   One of the paths through the ifdefs should provide 
     const uchar *search_line_fast (const uchar *s, const uchar *end);
   Between S and END, search for \n, \r, \\, ?.  Return a pointer to
   the found character.
   Note that the last character of the buffer is *always* a newline,
   as forced by _cpp_convert_input.  This fact can be used to avoid
   explicitly looking for the end of the buffer.  */
 /* Configure gives us an ifdef test.  */
 #ifndef WORDS_BIGENDIAN
 #define WORDS_BIGENDIAN 0
 #endif
 /* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
 #ifdef __GNUC__
 typedef unsigned int word_type __attribute__((__mode__(__word__)));
 #else
 typedef unsigned long word_type;
 #endif
 /* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated.  */
 typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 /* Return VAL with the first N bytes (in memory order) forced to zero, a
    value that won't match any of the interesting characters, since NUL
    is not interesting.  */
 static inline word_type
 acc_char_mask_misalign (word_type val, unsigned int n)
 {
  /* Begin with an all-ones word and shift zeros in over the N leading
     bytes.  Those bytes occupy the most significant end of the word on
     a big-endian host and the least significant end otherwise, hence
     the two shift directions.  */
  const unsigned int nbits = n * 8;
  const word_type all_ones = -1;
  const word_type keep
    = WORDS_BIGENDIAN ? all_ones >> nbits : all_ones << nbits;
  return val & keep;
 }
 /* Return X replicated to all byte positions within WORD_TYPE.  */
 static inline word_type
 acc_char_replicate (uchar x)
 {
  /* Widen X to WORD_TYPE before any shifting.  Shifting the promoted
     (signed int) value left by 24 would overflow the sign bit for
     X >= 0x80, and the subsequent conversion to a 64-bit WORD_TYPE
     would sign-extend into the upper half.  Working in the unsigned
     word type keeps every step well defined.  */
  word_type ret = x;
  /* Double the number of populated bytes at each step: 1 -> 2 -> 4.  */
  ret |= ret << 8;
  ret |= ret << 16;
  /* 4 -> 8 bytes.  The shift is split in two so that the expression is
     still valid (though never executed) when WORD_TYPE is 32 bits.  */
  if (sizeof(word_type) == 8)
    ret |= ret << 16 << 16;
  return ret;
 }
 /* Return non-zero if some byte of VAL is (probably) C.  */
 static inline word_type
 acc_char_cmp (word_type val, word_type c)
 {
 #if defined(__GNUC__) && defined(__alpha__)
  /* The compare-bytes instruction gives an exact answer: a byte of VAL
     equals the matching byte of C exactly when 0 >= that byte of
     (VAL ^ C).  */
  return __builtin_alpha_cmpbge (0, val ^ c);
 #else
  /* A byte of DIFF is zero exactly where VAL and C agree.  */
  const word_type diff = val ^ c;
  /* Build the constant 0x7efefeff (32-bit) or 0x7efefefefefefeff
     (64-bit).  The 64-bit form is assembled with a split shift so the
     expression stays valid when WORD_TYPE is only 32 bits wide.  */
  word_type magic = 0x7efefefeU;
  if (sizeof(word_type) == 8)
    magic = (magic << 16 << 16) | 0xfefefefeU;
  magic |= 1;
  /* Non-zero when some byte of DIFF is probably zero; the caller must
     confirm, since this test can report false positives.  */
  return ((diff + magic) ^ ~diff) & ~magic;
 #endif
 }
 /* Given the result of acc_char_cmp is non-zero, return the index of
   the found character.  If this was a false positive, return -1.  */
 static inline int
 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 		word_type val ATTRIBUTE_UNUSED)
 {
 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
  /* cmpbge reports matches as individual result *bits*, one per byte,
     and never yields a false positive, so the lowest set bit is the
     answer.  */
  return __builtin_ctzl (cmp);
 #else
  unsigned int pos;
  /* ??? Forced unrolling with constant folding would be nice here.  */
  /* Examine the bytes of VAL in memory order and stop at the first one
     that is an interesting character.  */
  for (pos = 0; pos < sizeof(word_type); ++pos)
    {
      /* Memory-order byte POS sits at the opposite end of the word on a
 	 big-endian host.  */
      const unsigned int byte_no
 	= WORDS_BIGENDIAN ? sizeof(word_type) - pos - 1 : pos;
      const uchar byte = (val >> byte_no * 8) & 0xff;
      switch (byte)
 	{
 	case '\n':
 	case '\r':
 	case '\\':
 	case '?':
 	  return pos;
 	default:
 	  break;
 	}
    }
  /* No byte matched: the compare result was a false positive.  */
  return -1;
 #endif
 }
 /* A version of the fast scanner using bit fiddling techniques.
   For 32-bit words, one would normally perform 16 comparisons and
   16 branches.  With this algorithm one performs 24 arithmetic
   operations and one branch.  Whether this is faster with a 32-bit
   word size is going to be somewhat system dependent.
   For 64-bit words, we eliminate twice the number of comparisons
   and branches without increasing the number of arithmetic operations.
   It's almost certainly going to be a win with 64-bit word size.  */
 /* Forward declaration, needed only to attach ATTRIBUTE_UNUSED: on hosts
    that select a vector implementation this routine may go unused.  */
 static const uchar * search_line_acc_char (const uchar *, const uchar *)
  ATTRIBUTE_UNUSED;
 /* Scan forward from S one machine word at a time and return a pointer to
    the first \n, \r, \\ or ? found.  END is not consulted; there is no
    end-of-buffer test in the loop, so the caller must guarantee that one
    of the searched characters occurs at or after S.  */
 static const uchar *
 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
  /* Each searched character replicated into every byte of a word.  */
  const word_type repl_nl = acc_char_replicate ('\n');
  const word_type repl_cr = acc_char_replicate ('\r');
  const word_type repl_bs = acc_char_replicate ('\\');
  const word_type repl_qm = acc_char_replicate ('?');
  unsigned int misalign;
  const word_type *p;
  word_type val, t;
  /* Align the buffer.  Mask out any bytes from before the beginning.
     The first load is from the aligned address at or below S, so it may
     include up to sizeof(word_type) - 1 bytes that precede S.  */
  p = (word_type *)((uintptr_t)s & -sizeof(word_type));
  val = *p;
  misalign = (uintptr_t)s & (sizeof(word_type) - 1);
  if (misalign)
    val = acc_char_mask_misalign (val, misalign);
  /* Main loop.  */
  while (1)
    {
      /* T is non-zero if any byte of VAL is probably one of the four
 	 characters.  */
      t  = acc_char_cmp (val, repl_nl);
      t |= acc_char_cmp (val, repl_cr);
      t |= acc_char_cmp (val, repl_bs);
      t |= acc_char_cmp (val, repl_qm);
      if (__builtin_expect (t != 0, 0))
 	{
 	  /* Confirm the hit; a negative index means false positive, in
 	     which case we simply move on to the next word.  */
 	  int i = acc_char_index (t, val);
 	  if (i >= 0)
 	    return (const uchar *)p + i;
 	}
      val = *++p;
    }
 }
 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
 /* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  */
 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
 };
 /* A version of the fast scanner using MMX vectorized byte compare insns.
   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD 3dNOW-A
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.  */
 /* S is the first byte to examine; END is unused.  Returns a pointer to
    the first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
    test, so the caller must guarantee that such a character exists.  */
 static const uchar *
 #ifndef __SSE__
 __attribute__((__target__("sse")))
 #endif
 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  /* Only the first 8 bytes of each 16-byte replicated row are used.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;
  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;
  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;
  /* Main loop processing 8 bytes at a time.  The first iteration enters
     at START so that the block already loaded above is examined with the
     misalignment mask; later iterations load the next block and accept
     matches in every byte.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;
    start:
      /* OR together the per-byte equality results for the four
 	 characters.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      /* Collapse the byte-wise result into one bit per byte and discard
 	 bits for bytes that precede S.  */
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);
  /* The MMX work is finished; issue EMMS before returning.  */
  __builtin_ia32_emms ();
  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
 }
 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 /* S is the first byte to examine; END is unused.  Returns a pointer to
    the first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
    test, so the caller must guarantee that such a character exists.  */
 static const uchar *
 #ifndef __SSE2__
 __attribute__((__target__("sse2")))
 #endif
 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* One 16-byte vector per searched character, taken from the shared
     replicated-character table.  */
  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;
  /* Align the source pointer.  The first load therefore starts at the
     16-byte boundary at or below S and may include bytes before S.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;
  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;
  /* Main loop processing 16 bytes at a time.  The first iteration enters
     at START to examine the block loaded above under the misalignment
     mask; later iterations load the next block with a full mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;
    start:
      /* OR together the per-byte equality results for the four
 	 characters.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      /* One result bit per byte; drop bits for bytes that precede S.  */
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);
  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
 }
 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 /* S is the first byte to examine and END is compared against S to judge
    how much of the buffer remains.  Returns a pointer to the first \n,
    \r, \\ or ? at or after S.  The main loop has no end-of-buffer test, so
    the caller must guarantee that such a character exists.  */
 static const uchar *
 #ifndef __SSE4_2__
 __attribute__((__target__("sse4.2")))
 #endif
 search_line_sse42 (const uchar *s, const uchar *end)
 {
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; the remaining 12 elements are
     zero-initialized.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };
  uintptr_t si = (uintptr_t)s;
  uintptr_t index;
  /* Check for unaligned input.  */
  if (si & 15)
    {
      if (__builtin_expect (end - s < 16, 0)
 	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
 	{
 	  /* There are fewer than 16 bytes left in the buffer, and fewer
 	     than 16 bytes left on the page.  Reading 16 bytes at this
 	     point might generate a spurious page fault.  Defer to the
 	     SSE2 implementation, which already handles alignment.  */
 	  return search_line_sse2 (s, end);
 	}
      /* ??? The builtin doesn't understand that the PCMPESTRI read from
 	 memory need not be aligned.  */
      /* The asm passes 4 in "a" and 16 in "d"; presumably these are the
 	 explicit lengths of SEARCH and of the data block (confirm against
 	 the PCMPESTRI description).  The result index arrives in "c".
 	 NOTE(review): the asm reads memory through %1 but declares no
 	 memory operand or clobber and is not volatile; confirm that the
 	 compiler cannot move it across stores to the buffer.  */
      __asm ("%vpcmpestri $0, (%1), %2"
 	     : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
      /* An index below 16 is treated as a match inside this block.  */
      if (__builtin_expect (index < 16, 0))
 	goto found;
      /* Advance the pointer to an aligned address.  We will re-scan a
 	 few bytes, but we no longer need care for reading past the
 	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }
  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  S is
     pre-decremented by 16 so that the loop can increment before each
     compare; the loop repeats while the carry flag is clear.  The same
     NOTE(review) about the missing memory operand applies here.  */
  __asm (      "sub $16, %1\n"
 	"	.balign 16\n"
 	"0:	add $16, %1\n"
 	"	%vpcmpestri $0, (%1), %2\n"
 	"	jnc 0b"
 	: "=&c"(index), "+r"(s)
 	: "x"(search), "a"(4), "d"(16));
 found:
  /* INDEX is the offset of the match within the block that starts at S.  */
  return s + index;
 }
 /* Check the CPU capabilities.  */
 #include "../gcc/config/i386/cpuid.h"
 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
 static search_line_fast_type search_line_fast;
 /* Select the implementation that search_line_fast points to.  Declared
    as a constructor, so it takes no arguments and returns nothing; its
    only effect is the final store to search_line_fast.  */
 static void __attribute__((constructor))
 init_vectorized_lexer (void)
 {
  unsigned dummy, ecx = 0, edx = 0;
  /* Fallback when no vector implementation is selected below.  */
  search_line_fast_type impl = search_line_acc_char;
  /* MINIMUM records the instruction set the compiler was already told
     to assume: 3 = SSE4.2, 2 = SSE2, 1 = SSE or 3DNow-A, 0 = none.  */
  int minimum = 0;
 #if defined(__SSE4_2__)
  minimum = 3;
 #elif defined(__SSE2__)
  minimum = 2;
 #elif defined(__SSE__) || defined(__3dNOW_A__)
  minimum = 1;
 #endif
  if (minimum == 3)
    impl = search_line_sse42;
  /* When MINIMUM is 2 this branch is entered even if the query fails;
     ECX and EDX were initialized to 0 above.
     NOTE(review): assumes __get_cpuid leaves them untouched on failure
     -- confirm against cpuid.h.  */
  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    {
      /* NOTE(review): MINIMUM == 3 was already handled by the first
 	 branch, so that half of this test cannot be true here.  */
      if (minimum == 3 || (ecx & bit_SSE4_2))
        impl = search_line_sse42;
      else if (minimum == 2 || (edx & bit_SSE2))
 	impl = search_line_sse2;
      else if (minimum == 1 || (edx & bit_SSE))
 	impl = search_line_mmx;
    }
  /* NOTE(review): this extended-leaf query is attempted only when the
     leaf-1 query above reported failure.  A CPU on which leaf 1 succeeds
     but which reports none of the SSE bits never reaches this test --
     confirm that is intended.  */
  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    {
      if (minimum == 1 || edx & bit_3DNOWP)
 	impl = search_line_mmx;
    }
  search_line_fast = impl;
 }
 #elif defined(__GNUC__) && defined(__ALTIVEC__)
 /* A version of the fast scanner using AltiVec vectorized byte compares.  */
 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */
 /* S is the first byte to examine; END is unused.  Returns a pointer to
    the first \n, \r, \\ or ? at or after S.  The loop has no end-of-buffer
    test, so the caller must guarantee that such a character exists.  */
 static const uchar *
 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 {
  typedef __attribute__((altivec(vector))) unsigned char vc;
  /* Each searched character replicated across a 16-byte vector.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?', 
    '?', '?', '?', '?', '?', '?', '?', '?', 
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };
  vc data, mask, t;
  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);
  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;
  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);
  /* Main loop processing 16 bytes at a time.  The first iteration enters
     at START so that the masked block loaded above is examined before
     any further load is issued.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;
      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);
    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);
      /* T now contains 0xff in bytes for which we matched one of the relevant
 	 characters.  We want to exit the loop if any byte in T is non-zero.
 	 Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  {
    /* N is the number of unsigned longs in one vector; the typedef below
       fails to compile unless that is 2 or 4.  */
 #define N  (sizeof(vc) / sizeof(long))
    typedef char check_count[(N == 2 || N == 4) * 2 - 1];
    union {
      vc v;
      unsigned long l[N];
    } u;
    unsigned long l, i = 0;
    u.v = t;
    /* Find the first word of T that is non-zero.  S is advanced past each
       all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
 	l = u.l[i++];
 	if (l != 0)
 	  break;
 	s += sizeof(unsigned long);
 	l = u.l[i++];
 	if (l != 0)
 	  break;
 	s += sizeof(unsigned long);
 	/* FALLTHRU: the remaining two words are handled exactly like the
 	   two-word case.  */
      case 2:
 	l = u.l[i++];
 	if (l != 0)
 	  break;
 	s += sizeof(unsigned long);
 	l = u.l[i];
      }
    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.
       NOTE(review): counting from the most significant bit equals the
       memory-order byte index only on a big-endian layout -- confirm
       that this path is built only for such hosts.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;
 #undef N
  }
 }
 #else
 /* We only have one accelerated alternative.  Use a direct call so that
    we encourage inlining.  */
 #define search_line_fast  search_line_acc_char
 #endif
 /* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.  */
 void
@ -109,82 +634,91 @@ _cpp_clean_line (cpp_reader *pfile)
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
-  s = buffer->next_line - 1;
+  s = buffer->next_line;
  if (!buffer->from_stage3)
    {
      const uchar *pbackslash = NULL;
-      /* Short circuit for the common case of an un-escaped line with
+      /* Fast path.  This is the common case of an un-escaped line with
 	 no trigraphs.  The primary win here is by not writing any
 	 data back to memory until we have to.  */
-      for (;;)
+      while (1)
 	{
-	  c = *++s;
+	  /* Perform an optimized search for \n, \r, \\, ?.  */
-	  if (__builtin_expect (c == '\n', false)
+	  s = search_line_fast (s, buffer->rlimit);
-	      || __builtin_expect (c == '\r', false))
+
 	  c = *s;
 	  if (c == '\\')
 	    {
-	      d = (uchar *) s;
+	      /* Record the location of the backslash and continue.  */
-
+	      pbackslash = s++;
 	      if (__builtin_expect (s == buffer->rlimit, false))
 		goto done;
 	      /* DOS line ending? */
 	      if (__builtin_expect (c == '\r', false)
 		  && s[1] == '\n')
 		{
 		  s++;
 		  if (s == buffer->rlimit)
 		    goto done;
 		}
 	      if (__builtin_expect (pbackslash == NULL, true))
 		goto done;
 	      /* Check for escaped newline.  */
 	      p = d;
 	      while (is_nvspace (p[-1]))
 		p--;
 	      if (p - 1 != pbackslash)
 		goto done;
 	      /* Have an escaped newline; process it and proceed to
 		 the slow path.  */
 	      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 	      d = p - 2;
 	      buffer->next_line = p - 1;
 	      break;
 	    }
-	  if (__builtin_expect (c == '\\', false))
+	  else if (__builtin_expect (c == '?', 0))
-	    pbackslash = s;
+	    {
-	  else if (__builtin_expect (c == '?', false)
+	      if (__builtin_expect (s[1] == '?', false)
 		   && __builtin_expect (s[1] == '?', false)
 		   && _cpp_trigraph_map[s[2]])
 	    {
 	      /* Have a trigraph.  We may or may not have to convert
 		 it.  Add a line note regardless, for -Wtrigraphs.  */
 	      add_line_note (buffer, s, s[2]);
 	      if (CPP_OPTION (pfile, trigraphs))
 		{
-		  /* We do, and that means we have to switch to the
+		  /* Have a trigraph.  We may or may not have to convert
-		     slow path.  */
+		     it.  Add a line note regardless, for -Wtrigraphs.  */
-		  d = (uchar *) s;
+		  add_line_note (buffer, s, s[2]);
-		  *d = _cpp_trigraph_map[s[2]];
+		  if (CPP_OPTION (pfile, trigraphs))
-		  s += 2;
+		    {
-		  break;
+		      /* We do, and that means we have to switch to the
 		         slow path.  */
 		      d = (uchar *) s;
 		      *d = _cpp_trigraph_map[s[2]];
 		      s += 2;
 		      goto slow_path;
 		    }
 		}
 	      /* Not a trigraph.  Continue on fast-path.  */
 	      s++;
 	    }
 	  else
 	    break;
 	}
      /* This must be \r or \n.  We're either done, or we'll be forced
 	 to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;
-      for (;;)
+      if (__builtin_expect (s == buffer->rlimit, false))
 	goto done;
      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 	{
 	  s++;
 	  if (s == buffer->rlimit)
 	    goto done;
 	}
      if (__builtin_expect (pbackslash == NULL, true))
 	goto done;
      /* Check for escaped newline.  */
      p = d;
      while (is_nvspace (p[-1]))
 	p--;
      if (p - 1 != pbackslash)
 	goto done;
      /* Have an escaped newline; process it and proceed to
 	 the slow path.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;
    slow_path:
      while (1)
 	{
 	  c = *++s;
 	  *++d = c;
 	  if (c == '\n' || c == '\r')
 	    {
-		  /* Handle DOS line endings.  */
+	      /* Handle DOS line endings.  */
 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 		s++;
 	      if (s == buffer->rlimit)
@ -215,9 +749,8 @@ _cpp_clean_line (cpp_reader *pfile)
    }
  else
    {
-      do
+      while (*s != '\n' && *s != '\r')
 	s++;
      while (*s != '\n' && *s != '\r');
      d = (uchar *) s;
      /* Handle DOS line endings.  */
--- a/libcpp/system.h
+++ b/libcpp/system.h
@ -29,6 +29,9 @@ along with GCC; see the file COPYING3.  If not see
 #ifdef HAVE_STDDEF_H
 # include <stddef.h>
 #endif
 #ifdef HAVE_STDINT_H
 # include <stdint.h>
 #endif
 #include <stdio.h>