Vectorize fast path of _cpp_clean_line.

* configure.ac (AC_C_BIGENDIAN, AC_TYPE_UINTPTR_T): New tests. (ssize_t): Check via AC_TYPE_SSIZE_T instead of AC_CHECK_TYPE. (ptrdiff_t): Check via AC_CHECK_TYPE. * config.in, configure: Rebuild. * system.h: Include stdint.h, if available. * lex.c (WORDS_BIGENDIAN): Provide default. (acc_char_mask_misalign, acc_char_replicate, acc_char_cmp, acc_char_index, search_line_acc_char, repl_chars, search_line_mmx, search_line_sse2, search_line_sse42, init_vectorized_lexer, search_line_fast): New. (_cpp_clean_line): Use search_line_fast. Restructure the fast loop to make it clear when we're leaving the loop. Stay in the fast loop for non-trigraph '?'. Co-Authored-By: Andi Kleen <ak@linux.intel.com> Co-Authored-By: David S. Miller <davem@davemloft.net> From-SVN: r163446
2010-08-21 12:05:40 -07:00 · 2010-08-21 12:05:40 -07:00 · 246a2fcb5e
parent 1d0134b3cc
commit 246a2fcb5e
6 changed files with 953 additions and 99 deletions
--- a/libcpp/ChangeLog
+++ b/libcpp/ChangeLog
@ -1,3 +1,21 @@
+2010-08-21  Richard Henderson  <rth@redhat.com>
+	    Andi Kleen <ak@linux.intel.com>
+	    David S. Miller  <davem@davemloft.net>
+
+	* configure.ac (AC_C_BIGENDIAN, AC_TYPE_UINTPTR_T): New tests.
+	(ssize_t): Check via AC_TYPE_SSIZE_T instead of AC_CHECK_TYPE.
+	(ptrdiff_t): Check via AC_CHECK_TYPE.
+	* config.in, configure: Rebuild.
+	* system.h: Include stdint.h, if available.
+	* lex.c (WORDS_BIGENDIAN): Provide default.
+	(acc_char_mask_misalign, acc_char_replicate, acc_char_cmp,
+	acc_char_index, search_line_acc_char, repl_chars, search_line_mmx,
+	search_line_sse2, search_line_sse42, init_vectorized_lexer,
+	search_line_fast): New.
+	(_cpp_clean_line): Use search_line_fast.  Restructure the fast
+	loop to make it clear when we're leaving the loop.  Stay in the
+	fast loop for non-trigraph '?'.
+
 2010-06-11  Jakub Jelinek  <jakub@redhat.com>

 	* include/cpplib.h (struct cpp_callbacks): Add user_builtin_macro
--- a/libcpp/config.in
+++ b/libcpp/config.in
@ -1,5 +1,8 @@
 /* config.in.  Generated from configure.ac by autoheader.  */

+/* Define if building universal (internal helper macro) */
+#undef AC_APPLE_UNIVERSAL_BUILD
+
 /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
   systems. This function is required for `alloca.c' support on those systems.
   */
@ -209,6 +212,9 @@
 /* Define if <sys/types.h> defines \`uchar'. */
 #undef HAVE_UCHAR

+/* Define to 1 if the system has the type `uintptr_t'. */
+#undef HAVE_UINTPTR_T
+
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H

@ -266,6 +272,18 @@
 /* Define to 1 if your <sys/time.h> declares `struct tm'. */
 #undef TM_IN_SYS_TIME

+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+#  undef WORDS_BIGENDIAN
+# endif
+#endif
+
 /* Define to empty if `const' does not conform to ANSI C. */
 #undef const

@ -278,8 +296,15 @@
 /* Define to `long int' if <sys/types.h> does not define. */
 #undef off_t

+/* Define to `int' if <sys/types.h> does not define. */
+#undef ptrdiff_t
+
 /* Define to `unsigned int' if <sys/types.h> does not define. */
 #undef size_t

 /* Define to `int' if <sys/types.h> does not define. */
 #undef ssize_t
+
+/* Define to the type of an unsigned integer type wide enough to hold a
+   pointer, if such a type exists, and if the system does not define it. */
+#undef uintptr_t
--- a/libcpp/configure
+++ b/libcpp/configure
@ -1846,6 +1846,48 @@ fi

 } # ac_fn_cxx_check_header_mongrel

+# ac_fn_cxx_try_run LINENO
+# ------------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_cxx_try_run ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  ac_retval=0
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+       $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+       ac_retval=$ac_status
+fi
+  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
+  return $ac_retval
+
+} # ac_fn_cxx_try_run
+
 # ac_fn_cxx_try_link LINENO
 # -------------------------
 # Try to link conftest.$ac_ext, and return whether this succeeded.
@ -1946,48 +1988,6 @@ $as_echo "$ac_res" >&6; }

 } # ac_fn_cxx_check_type

-# ac_fn_cxx_try_run LINENO
-# ------------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
-# that executables *can* be run.
-ac_fn_cxx_try_run ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
-  { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: program exited with status $ac_status" >&5
-       $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-       ac_retval=$ac_status
-fi
-  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  return $ac_retval
-
-} # ac_fn_cxx_try_run
-
 # ac_fn_cxx_compute_int LINENO EXPR VAR INCLUDES
 # ----------------------------------------------
 # Tries to find the compile-time value of EXPR in a program that includes
@ -5172,6 +5172,230 @@ done
 fi

 # Checks for typedefs, structures, and compiler characteristics.
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
+$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
+if test "${ac_cv_c_bigendian+set}" = set; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_c_bigendian=unknown
+    # See if we're dealing with a universal compiler.
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#ifndef __APPLE_CC__
+	       not a universal capable compiler
+	     #endif
+	     typedef int dummy;
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+	# Check for potential -arch flags.  It is not universal unless
+	# there are at least two -arch flags with different values.
+	ac_arch=
+	ac_prev=
+	for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do
+	 if test -n "$ac_prev"; then
+	   case $ac_word in
+	     i?86 | x86_64 | ppc | ppc64)
+	       if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then
+		 ac_arch=$ac_word
+	       else
+		 ac_cv_c_bigendian=universal
+		 break
+	       fi
+	       ;;
+	   esac
+	   ac_prev=
+	 elif test "x$ac_word" = "x-arch"; then
+	   ac_prev=arch
+	 fi
+       done
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    if test $ac_cv_c_bigendian = unknown; then
+      # See if sys/param.h defines the BYTE_ORDER macro.
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+	     #include <sys/param.h>
+
+int
+main ()
+{
+#if ! (defined BYTE_ORDER && defined BIG_ENDIAN \
+		     && defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \
+		     && LITTLE_ENDIAN)
+	      bogus endian macros
+	     #endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  # It does; now see whether it defined to BIG_ENDIAN or not.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <sys/types.h>
+		#include <sys/param.h>
+
+int
+main ()
+{
+#if BYTE_ORDER != BIG_ENDIAN
+		 not big endian
+		#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_c_bigendian=yes
+else
+  ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    fi
+    if test $ac_cv_c_bigendian = unknown; then
+      # See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris).
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+
+int
+main ()
+{
+#if ! (defined _LITTLE_ENDIAN || defined _BIG_ENDIAN)
+	      bogus endian macros
+	     #endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  # It does; now see whether it defined to _BIG_ENDIAN or not.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+
+int
+main ()
+{
+#ifndef _BIG_ENDIAN
+		 not big endian
+		#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  ac_cv_c_bigendian=yes
+else
+  ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    fi
+    if test $ac_cv_c_bigendian = unknown; then
+      # Compile a test program.
+      if test "$cross_compiling" = yes; then :
+  # Try to guess by grepping values from an object file.
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+short int ascii_mm[] =
+		  { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
+		short int ascii_ii[] =
+		  { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
+		int use_ascii (int i) {
+		  return ascii_mm[i] + ascii_ii[i];
+		}
+		short int ebcdic_ii[] =
+		  { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
+		short int ebcdic_mm[] =
+		  { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
+		int use_ebcdic (int i) {
+		  return ebcdic_mm[i] + ebcdic_ii[i];
+		}
+		extern int foo;
+
+int
+main ()
+{
+return use_ascii (foo) == use_ebcdic (foo);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+  if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then
+	      ac_cv_c_bigendian=yes
+	    fi
+	    if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
+	      if test "$ac_cv_c_bigendian" = unknown; then
+		ac_cv_c_bigendian=no
+	      else
+		# finding both strings is unlikely to happen, but who knows?
+		ac_cv_c_bigendian=unknown
+	      fi
+	    fi
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+
+	     /* Are we little or big endian?  From Harbison&Steele.  */
+	     union
+	     {
+	       long int l;
+	       char c[sizeof (long int)];
+	     } u;
+	     u.l = 1;
+	     return u.c[sizeof (long int) - 1] == 1;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_run "$LINENO"; then :
+  ac_cv_c_bigendian=no
+else
+  ac_cv_c_bigendian=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+    fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5
+$as_echo "$ac_cv_c_bigendian" >&6; }
+ case $ac_cv_c_bigendian in #(
+   yes)
+     $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h
+;; #(
+   no)
+      ;; #(
+   universal)
+
+$as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
+
+     ;; #(
+   *)
+     as_fn_error "unknown endianness
+ presetting ac_cv_c_bigendian=no (or yes) will help" "$LINENO" 5 ;;
+ esac
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5
 $as_echo_n "checking for an ANSI C-conforming const... " >&6; }
 if test "${ac_cv_c_const+set}" = set; then :
@ -5369,6 +5593,53 @@ cat >>confdefs.h <<_ACEOF
 #define ssize_t int
 _ACEOF

+fi
+
+
+  ac_fn_cxx_check_type "$LINENO" "uintptr_t" "ac_cv_type_uintptr_t" "$ac_includes_default"
+if test "x$ac_cv_type_uintptr_t" = x""yes; then :
+
+$as_echo "#define HAVE_UINTPTR_T 1" >>confdefs.h
+
+else
+  for ac_type in 'unsigned int' 'unsigned long int' \
+	'unsigned long long int'; do
+       cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(sizeof (void *) <= sizeof ($ac_type))];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define uintptr_t $ac_type
+_ACEOF
+
+	  ac_type=
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+       test -z "$ac_type" && break
+     done
+fi
+
+
+ac_fn_cxx_check_type "$LINENO" "ptrdiff_t" "ac_cv_type_ptrdiff_t" "$ac_includes_default"
+if test "x$ac_cv_type_ptrdiff_t" = x""yes; then :
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define ptrdiff_t int
+_ACEOF
+
 fi

 { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether struct tm is in sys/time.h or time.h" >&5
@ -7042,6 +7313,7 @@ LTLIBOBJS=$ac_ltlibobjs



+
 : ${CONFIG_STATUS=./config.status}
 ac_write_fail=0
 ac_clean_files_save=$ac_clean_files
--- a/libcpp/configure.ac
+++ b/libcpp/configure.ac
@ -70,12 +70,15 @@ else
 fi

 # Checks for typedefs, structures, and compiler characteristics.
+AC_C_BIGENDIAN
 AC_C_CONST
 AC_C_INLINE
 AC_FUNC_OBSTACK
 AC_TYPE_OFF_T
 AC_TYPE_SIZE_T
-AC_CHECK_TYPE(ssize_t, int)
+AC_TYPE_SSIZE_T
+AC_TYPE_UINTPTR_T
+AC_CHECK_TYPE(ptrdiff_t, int)
 AC_STRUCT_TM
 AC_CHECK_SIZEOF(int)
 AC_CHECK_SIZEOF(long)
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@ -1,5 +1,5 @@
 /* CPP Library - lexical analysis.
-   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009
+   Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
@ -96,6 +96,531 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  buffer->notes_used++;
 }

+
+/* Fast path to find line special characters using optimized character
+   scanning algorithms.  Anything complicated falls back to the slow
+   path below.  Since this loop is very hot it's worth doing these kinds
+   of optimizations.
+
+   One of the paths through the ifdefs should provide 
+
+     const uchar *search_line_fast (const uchar *s, const uchar *end);
+
+   Between S and END, search for \n, \r, \\, ?.  Return a pointer to
+   the found character.
+
+   Note that the last character of the buffer is *always* a newline,
+   as forced by _cpp_convert_input.  This fact can be used to avoid
+   explicitly looking for the end of the buffer.  */
+
+/* Configure gives us an ifdef test.  */
+#ifndef WORDS_BIGENDIAN
+#define WORDS_BIGENDIAN 0
+#endif
+
+/* We'd like the largest integer that fits into a register.  There's nothing
+   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
+   but MS decided on an LLP64 model.  Thankfully when building with GCC we
+   can get the "real" word size.  */
+#ifdef __GNUC__
+typedef unsigned int word_type __attribute__((__mode__(__word__)));
+#else
+typedef unsigned long word_type;
+#endif
+
+/* The code below is only expecting sizes 4 or 8.
+   Die at compile-time if this expectation is violated.  */
+typedef char check_word_type_size
+  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
+
+/* Return X with the first N bytes forced to values that won't match one
+   of the interesting characters.  Note that NUL is not interesting.  */
+
+static inline word_type
+acc_char_mask_misalign (word_type val, unsigned int n)
+{
+  word_type mask = -1;
+  if (WORDS_BIGENDIAN)
+    mask >>= n * 8;
+  else
+    mask <<= n * 8;
+  return val & mask;
+}
+
+/* Return X replicated to all byte positions within WORD_TYPE.  */
+
+static inline word_type
+acc_char_replicate (uchar x)
+{
+  word_type ret;
+
+  ret = (x << 24) | (x << 16) | (x << 8) | x;
+  if (sizeof(word_type) == 8)
+    ret = (ret << 16 << 16) | ret;
+  return ret;
+}
+
+/* Return non-zero if some byte of VAL is (probably) C.  */
+
+static inline word_type
+acc_char_cmp (word_type val, word_type c)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* We can get exact results using a compare-bytes instruction.  
+     Get (val == c) via (0 >= (val ^ c)).  */
+  return __builtin_alpha_cmpbge (0, val ^ c);
+#else
+  word_type magic = 0x7efefefeU;
+  if (sizeof(word_type) == 8)
+    magic = (magic << 16 << 16) | 0xfefefefeU;
+  magic |= 1;
+
+  val ^= c;
+  return ((val + magic) ^ ~val) & ~magic;
+#endif
+}
+
+/* Given the result of acc_char_cmp is non-zero, return the index of
+   the found character.  If this was a false positive, return -1.  */
+
+static inline int
+acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
+		word_type val ATTRIBUTE_UNUSED)
+{
+#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
+  /* The cmpbge instruction sets *bits* of the result corresponding to
+     matches in the bytes with no false positives.  */
+  return __builtin_ctzl (cmp);
+#else
+  unsigned int i;
+
+  /* ??? It would be nice to force unrolling here,
+     and have all of these constants folded.  */
+  for (i = 0; i < sizeof(word_type); ++i)
+    {
+      uchar c;
+      if (WORDS_BIGENDIAN)
+	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
+      else
+	c = (val >> i * 8) & 0xff;
+
+      if (c == '\n' || c == '\r' || c == '\\' || c == '?')
+	return i;
+    }
+
+  return -1;
+#endif
+}
+
+/* A version of the fast scanner using bit fiddling techniques.
+ 
+   For 32-bit words, one would normally perform 16 comparisons and
+   16 branches.  With this algorithm one performs 24 arithmetic
+   operations and one branch.  Whether this is faster with a 32-bit
+   word size is going to be somewhat system dependent.
+
+   For 64-bit words, we eliminate twice the number of comparisons
+   and branches without increasing the number of arithmetic operations.
+   It's almost certainly going to be a win with 64-bit word size.  */
+
+static const uchar * search_line_acc_char (const uchar *, const uchar *)
+  ATTRIBUTE_UNUSED;
+
+static const uchar *
+search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  const word_type repl_nl = acc_char_replicate ('\n');
+  const word_type repl_cr = acc_char_replicate ('\r');
+  const word_type repl_bs = acc_char_replicate ('\\');
+  const word_type repl_qm = acc_char_replicate ('?');
+
+  unsigned int misalign;
+  const word_type *p;
+  word_type val, t;
+  
+  /* Align the buffer.  Mask out any bytes from before the beginning.  */
+  p = (word_type *)((uintptr_t)s & -sizeof(word_type));
+  val = *p;
+  misalign = (uintptr_t)s & (sizeof(word_type) - 1);
+  if (misalign)
+    val = acc_char_mask_misalign (val, misalign);
+
+  /* Main loop.  */
+  while (1)
+    {
+      t  = acc_char_cmp (val, repl_nl);
+      t |= acc_char_cmp (val, repl_cr);
+      t |= acc_char_cmp (val, repl_bs);
+      t |= acc_char_cmp (val, repl_qm);
+
+      if (__builtin_expect (t != 0, 0))
+	{
+	  int i = acc_char_index (t, val);
+	  if (i >= 0)
+	    return (const uchar *)p + i;
+	}
+
+      val = *++p;
+    }
+}
+
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+
+/* Replicated character data to be shared between implementations.
+   Recall that outside of a context with vector support we can't
+   define compatible vector types, therefore these are all defined
+   in terms of raw characters.  */
+static const char repl_chars[4][16] __attribute__((aligned(16))) = {
+  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
+  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
+  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
+  { '?', '?', '?', '?', '?', '?', '?', '?',
+    '?', '?', '?', '?', '?', '?', '?', '?' },
+};
+
+/* A version of the fast scanner using MMX vectorized byte compare insns.
+
+   This uses the PMOVMSKB instruction which was introduced with "MMX2",
+   which was packaged into SSE1; it is also present in the AMD 3dNOW-A
+   extension.  Mark the function as using "sse" so that we emit a real
+   "emms" instruction, rather than the 3dNOW "femms" instruction.  */
+
+static const uchar *
+#ifndef __SSE__
+__attribute__((__target__("sse")))
+#endif
+search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef char v8qi __attribute__ ((__vector_size__ (8)));
+  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
+
+  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
+  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
+  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
+  const v8qi repl_qm = *(const v8qi *)repl_chars[3];
+
+  unsigned int misalign, found, mask;
+  const v8qi *p;
+  v8qi data, t, c;
+
+  /* Align the source pointer.  While MMX doesn't generate unaligned data
+     faults, this allows us to safely scan to the end of the buffer without
+     reading beyond the end of the last page.  */
+  misalign = (uintptr_t)s & 7;
+  p = (const v8qi *)((uintptr_t)s & -8);
+  data = *p;
+
+  /* Create a mask for the bytes that are valid within the first
+     16-byte block.  The Idea here is that the AND with the mask
+     within the loop is "free", since we need some AND or TEST
+     insn in order to set the flags for the branch anyway.  */
+  mask = -1u << misalign;
+
+  /* Main loop processing 8 bytes at a time.  */
+  goto start;
+  do
+    {
+      data = *++p;
+      mask = -1;
+
+    start:
+      t = __builtin_ia32_pcmpeqb(data, repl_nl);
+      c = __builtin_ia32_pcmpeqb(data, repl_cr);
+      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
+      c = __builtin_ia32_pcmpeqb(data, repl_bs);
+      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
+      c = __builtin_ia32_pcmpeqb(data, repl_qm);
+      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
+      found = __builtin_ia32_pmovmskb (t);
+      found &= mask;
+    }
+  while (!found);
+
+  __builtin_ia32_emms ();
+
+  /* FOUND contains 1 in bits for which we matched a relevant
+     character.  Conversion to the byte index is trivial.  */
+  found = __builtin_ctz(found);
+  return (const uchar *)p + found;
+}
+
+/* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
+
+static const uchar *
+#ifndef __SSE2__
+__attribute__((__target__("sse2")))
+#endif
+search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef char v16qi __attribute__ ((__vector_size__ (16)));
+
+  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
+  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
+  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
+  const v16qi repl_qm = *(const v16qi *)repl_chars[3];
+
+  unsigned int misalign, found, mask;
+  const v16qi *p;
+  v16qi data, t;
+
+  /* Align the source pointer.  */
+  misalign = (uintptr_t)s & 15;
+  p = (const v16qi *)((uintptr_t)s & -16);
+  data = *p;
+
+  /* Create a mask for the bytes that are valid within the first
+     16-byte block.  The Idea here is that the AND with the mask
+     within the loop is "free", since we need some AND or TEST
+     insn in order to set the flags for the branch anyway.  */
+  mask = -1u << misalign;
+
+  /* Main loop processing 16 bytes at a time.  */
+  goto start;
+  do
+    {
+      data = *++p;
+      mask = -1;
+
+    start:
+      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+      found = __builtin_ia32_pmovmskb128 (t);
+      found &= mask;
+    }
+  while (!found);
+
+  /* FOUND contains 1 in bits for which we matched a relevant
+     character.  Conversion to the byte index is trivial.  */
+  found = __builtin_ctz(found);
+  return (const uchar *)p + found;
+}
+
+/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
+
+static const uchar *
+#ifndef __SSE4_2__
+__attribute__((__target__("sse4.2")))
+#endif
+search_line_sse42 (const uchar *s, const uchar *end)
+{
+  typedef char v16qi __attribute__ ((__vector_size__ (16)));
+  static const v16qi search = { '\n', '\r', '?', '\\' };
+
+  uintptr_t si = (uintptr_t)s;
+  uintptr_t index;
+
+  /* Check for unaligned input.  */
+  if (si & 15)
+    {
+      if (__builtin_expect (end - s < 16, 0)
+	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
+	{
+	  /* There are less than 16 bytes left in the buffer, and less
+	     than 16 bytes left on the page.  Reading 16 bytes at this
+	     point might generate a spurious page fault.  Defer to the
+	     SSE2 implementation, which already handles alignment.  */
+	  return search_line_sse2 (s, end);
+	}
+
+      /* ??? The builtin doesn't understand that the PCMPESTRI read from
+	 memory need not be aligned.  */
+      __asm ("%vpcmpestri $0, (%1), %2"
+	     : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
+      if (__builtin_expect (index < 16, 0))
+	goto found;
+
+      /* Advance the pointer to an aligned address.  We will re-scan a
+	 few bytes, but we no longer need care for reading past the
+	 end of a page, since we're guaranteed a match.  */
+      s = (const uchar *)((si + 16) & -16);
+    }
+
+  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
+     in inline assembly, we can make proper use of the flags set.  */
+  __asm (      "sub $16, %1\n"
+	"	.balign 16\n"
+	"0:	add $16, %1\n"
+	"	%vpcmpestri $0, (%1), %2\n"
+	"	jnc 0b"
+	: "=&c"(index), "+r"(s)
+	: "x"(search), "a"(4), "d"(16));
+
+ found:
+  return s + index;
+}
+
+/* Check the CPU capabilities.  */
+
+#include "../gcc/config/i386/cpuid.h"
+
+typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
+static search_line_fast_type search_line_fast;
+
+static void __attribute__((constructor))
+init_vectorized_lexer (void)
+{
+  unsigned dummy, ecx = 0, edx = 0;
+  search_line_fast_type impl = search_line_acc_char;
+  int minimum = 0;
+
+#if defined(__SSE4_2__)
+  minimum = 3;
+#elif defined(__SSE2__)
+  minimum = 2;
+#elif defined(__SSE__) || defined(__3dNOW_A__)
+  minimum = 1;
+#endif
+
+  if (minimum == 3)
+    impl = search_line_sse42;
+  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
+    {
+      if (minimum == 3 || (ecx & bit_SSE4_2))
+        impl = search_line_sse42;
+      else if (minimum == 2 || (edx & bit_SSE2))
+	impl = search_line_sse2;
+      else if (minimum == 1 || (edx & bit_SSE))
+	impl = search_line_mmx;
+    }
+  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
+    {
+      if (minimum == 1 || edx & bit_3DNOWP)
+	impl = search_line_mmx;
+    }
+
+  search_line_fast = impl;
+}
+
+#elif defined(__GNUC__) && defined(__ALTIVEC__)
+
+/* A vection of the fast scanner using AltiVec vectorized byte compares.  */
+/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
+   so we can't compile this function without -maltivec on the command line
+   (or implied by some other switch).  */
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef __attribute__((altivec(vector))) unsigned char vc;
+
+  const vc repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  const vc repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  const vc repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  const vc repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+  };
+  const vc ones = {
+    -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1,
+  };
+  const vc zero = { 0 };
+
+  vc data, mask, t;
+
+  /* Altivec loads automatically mask addresses with -16.  This lets us
+     issue the first load as early as possible.  */
+  data = __builtin_vec_ld(0, (const vc *)s);
+
+  /* Discard bytes before the beginning of the buffer.  Do this by
+     beginning with all ones and shifting in zeros according to the
+     mis-alignment.  The LVSR instruction pulls the exact shift we
+     want from the address.  */
+  mask = __builtin_vec_lvsr(0, s);
+  mask = __builtin_vec_perm(zero, ones, mask);
+  data &= mask;
+
+  /* While altivec loads mask addresses, we still need to align S so
+     that the offset we compute at the end is correct.  */
+  s = (const uchar *)((uintptr_t)s & -16);
+
+  /* Main loop processing 16 bytes at a time.  */
+  goto start;
+  do
+    {
+      vc m_nl, m_cr, m_bs, m_qm;
+
+      s += 16;
+      data = __builtin_vec_ld(0, (const vc *)s);
+
+    start:
+      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
+      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
+      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
+      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+
+      /* T now contains 0xff in bytes for which we matched one of the relevant
+	 characters.  We want to exit the loop if any byte in T is non-zero.
+	 Below is the expansion of vec_any_ne(t, zero).  */
+    }
+  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
+
+  {
+#define N  (sizeof(vc) / sizeof(long))
+
+    typedef char check_count[(N == 2 || N == 4) * 2 - 1];
+    union {
+      vc v;
+      unsigned long l[N];
+    } u;
+    unsigned long l, i = 0;
+
+    u.v = t;
+
+    /* Find the first word of T that is non-zero.  */
+    switch (N)
+      {
+      case 4:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+      case 2:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i];
+      }
+
+    /* L now contains 0xff in bytes for which we matched one of the
+       relevant characters.  We can find the byte index by finding
+       its bit index and dividing by 8.  */
+    l = __builtin_clzl(l) >> 3;
+    return s + l;
+
+#undef N
+  }
+}
+
+#else
+
+/* We only have one accellerated alternative.  Use a direct call so that
+   we encourage inlining.  */
+
+#define search_line_fast  search_line_acc_char
+
+#endif
+
 /* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.  */
 void
@ -109,82 +634,91 @@ _cpp_clean_line (cpp_reader *pfile)
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
-  s = buffer->next_line - 1;
+  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      const uchar *pbackslash = NULL;

-      /* Short circuit for the common case of an un-escaped line with
+      /* Fast path.  This is the common case of an un-escaped line with
 	 no trigraphs.  The primary win here is by not writing any
 	 data back to memory until we have to.  */
-      for (;;)
+      while (1)
 	{
-	  c = *++s;
-	  if (__builtin_expect (c == '\n', false)
-	      || __builtin_expect (c == '\r', false))
+	  /* Perform an optimized search for \n, \r, \\, ?.  */
+	  s = search_line_fast (s, buffer->rlimit);
+
+	  c = *s;
+	  if (c == '\\')
 	    {
-	      d = (uchar *) s;
-
-	      if (__builtin_expect (s == buffer->rlimit, false))
-		goto done;
-
-	      /* DOS line ending? */
-	      if (__builtin_expect (c == '\r', false)
-		  && s[1] == '\n')
-		{
-		  s++;
-		  if (s == buffer->rlimit)
-		    goto done;
-		}
-
-	      if (__builtin_expect (pbackslash == NULL, true))
-		goto done;
-
-	      /* Check for escaped newline.  */
-	      p = d;
-	      while (is_nvspace (p[-1]))
-		p--;
-	      if (p - 1 != pbackslash)
-		goto done;
-
-	      /* Have an escaped newline; process it and proceed to
-		 the slow path.  */
-	      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
-	      d = p - 2;
-	      buffer->next_line = p - 1;
-	      break;
+	      /* Record the location of the backslash and continue.  */
+	      pbackslash = s++;
 	    }
-	  if (__builtin_expect (c == '\\', false))
-	    pbackslash = s;
-	  else if (__builtin_expect (c == '?', false)
-		   && __builtin_expect (s[1] == '?', false)
+	  else if (__builtin_expect (c == '?', 0))
+	    {
+	      if (__builtin_expect (s[1] == '?', false)
 		   && _cpp_trigraph_map[s[2]])
-	    {
-	      /* Have a trigraph.  We may or may not have to convert
-		 it.  Add a line note regardless, for -Wtrigraphs.  */
-	      add_line_note (buffer, s, s[2]);
-	      if (CPP_OPTION (pfile, trigraphs))
 		{
-		  /* We do, and that means we have to switch to the
-		     slow path.  */
-		  d = (uchar *) s;
-		  *d = _cpp_trigraph_map[s[2]];
-		  s += 2;
-		  break;
+		  /* Have a trigraph.  We may or may not have to convert
+		     it.  Add a line note regardless, for -Wtrigraphs.  */
+		  add_line_note (buffer, s, s[2]);
+		  if (CPP_OPTION (pfile, trigraphs))
+		    {
+		      /* We do, and that means we have to switch to the
+		         slow path.  */
+		      d = (uchar *) s;
+		      *d = _cpp_trigraph_map[s[2]];
+		      s += 2;
+		      goto slow_path;
+		    }
 		}
+	      /* Not a trigraph.  Continue on fast-path.  */
+	      s++;
 	    }
+	  else
+	    break;
 	}

+      /* This must be \r or \n.  We're either done, or we'll be forced
+	 to write back to the buffer and continue on the slow path.  */
+      d = (uchar *) s;

-      for (;;)
+      if (__builtin_expect (s == buffer->rlimit, false))
+	goto done;
+
+      /* DOS line ending? */
+      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
+	{
+	  s++;
+	  if (s == buffer->rlimit)
+	    goto done;
+	}
+
+      if (__builtin_expect (pbackslash == NULL, true))
+	goto done;
+
+      /* Check for escaped newline.  */
+      p = d;
+      while (is_nvspace (p[-1]))
+	p--;
+      if (p - 1 != pbackslash)
+	goto done;
+
+      /* Have an escaped newline; process it and proceed to
+	 the slow path.  */
+      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
+      d = p - 2;
+      buffer->next_line = p - 1;
+
+    slow_path:
+      while (1)
 	{
 	  c = *++s;
 	  *++d = c;

 	  if (c == '\n' || c == '\r')
 	    {
-		  /* Handle DOS line endings.  */
+	      /* Handle DOS line endings.  */
 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 		s++;
 	      if (s == buffer->rlimit)
@ -215,9 +749,8 @@ _cpp_clean_line (cpp_reader *pfile)
    }
  else
    {
-      do
+      while (*s != '\n' && *s != '\r')
 	s++;
-      while (*s != '\n' && *s != '\r');
      d = (uchar *) s;

      /* Handle DOS line endings.  */
--- a/libcpp/system.h
+++ b/libcpp/system.h
@ -29,6 +29,9 @@ along with GCC; see the file COPYING3.  If not see
 #ifdef HAVE_STDDEF_H
 # include <stddef.h>
 #endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif

 #include <stdio.h>