This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH] vectorized string functions

From: Ondřej Bílka <neleai at seznam dot cz>
To: libc-alpha at sourceware dot org
Date: Wed, 11 Jul 2012 17:10:07 +0200
Subject: [PATCH] vectorized string functions
I am now almost done with a vectorized implementation of the string functions.
I use a single loop to get faster implementations of the *len, *chr and *str
functions.

The main functionality is in the file string/loop.h. In the loop I do matching
based on operations defined in the files vector.h, sse.h and arit.h.

It is easy to write analogous files for the wide-character functions.

The variants using plain arithmetic are in the string directory, and the variants
using SSE are in sysdeps/x86_64/multiarch. To keep the patch shorter, the SSE
variants are generated by running the script sysdeps/x86_64/multiarch/gen_stub.

For strlen, see my previous patch with the optimized version.
For strchr, a patch like the strlen one can be provided.
In strnlen, rtld does not compile when weak_alias is used.

strcasestr is not completely done yet.
I could get almost the same speed as strstr if I precalculated
per-locale tables, as in the file calc_tolower_cls.

In the arithmetic version, finding the first nonzero bit is tricky because the bits are
ordered 0,8,16...1,9,17...2,10,18...

---
 string/arit.h                                  |  132 ++++++
 string/loop.h                                  |  144 ++++++
 string/memchr.c                                |  152 +-------
 string/memmem.c                                |   78 +----
 string/memrchr.c                               |  165 +-------
 string/rawmemchr.c                             |  143 +------
 string/str-two-way.h                           |  428 ------------------
 string/strcasestr.c                            |   99 +----
 string/strchr.c                                |  187 +--------
 string/strchr.h                                |   53 +++
 string/strchrnul.c                             |  142 +------
 string/strlen.c                                |  109 +-----
 string/strlen.h                                |   21 +
 string/strnlen.c                               |  166 +-------
 string/strrchr.c                               |   49 +--
 string/strstr.c                                |   88 +----
 string/strstr.h                                |  242 +++++++++++
 string/strstr_vec.h                            |   34 ++
 string/vector.h                                |   84 ++++
 sysdeps/x86_64/memchr.S                        |  311 -------------
 sysdeps/x86_64/multiarch/Makefile              |   40 ++-
 sysdeps/x86_64/multiarch/gen_stub              |  102 +++++
 sysdeps/x86_64/multiarch/strcasestr-c.c        |   16 -
 sysdeps/x86_64/multiarch/strcasestr-nonascii.c |   49 --
 sysdeps/x86_64/multiarch/strcasestr.c          |    7 -
 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S |    3 -
 sysdeps/x86_64/multiarch/strnlen.S             |   54 ---
 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S |  555 ------------------------
 sysdeps/x86_64/multiarch/strrchr.S             |  281 ------------
 sysdeps/x86_64/multiarch/strstr-c.c            |   15 -
 sysdeps/x86_64/multiarch/strstr.c              |  384 ----------------
 sysdeps/x86_64/sse.h                           |  104 +++++
 sysdeps/x86_64/strchrnul.S                     |   62 ---
 sysdeps/x86_64/strnlen.S                       |   63 ---
 sysdeps/x86_64/strrchr.S                       |   80 ----
 35 files changed, 997 insertions(+), 3645 deletions(-)
 create mode 100644 string/arit.h
 create mode 100644 string/loop.h
 delete mode 100644 string/str-two-way.h
 create mode 100644 string/strchr.h
 create mode 100644 string/strlen.h
 create mode 100644 string/strstr.h
 create mode 100644 string/strstr_vec.h
 create mode 100644 string/vector.h
 delete mode 100644 sysdeps/x86_64/memchr.S
 create mode 100755 sysdeps/x86_64/multiarch/gen_stub
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr.c
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
 delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
 delete mode 100644 sysdeps/x86_64/multiarch/strstr.c
 create mode 100644 sysdeps/x86_64/sse.h
 delete mode 100644 sysdeps/x86_64/strchrnul.S
 delete mode 100644 sysdeps/x86_64/strnlen.S
 delete mode 100644 sysdeps/x86_64/strrchr.S

diff --git a/string/arit.h b/string/arit.h
new file mode 100644
index 0000000..475dc59
--- /dev/null
+++ b/string/arit.h
@@ -0,0 +1,132 @@
+#include <stdint.h>
+#include "endian.h"
+
+#define unroll 4
+#if     __WORDSIZE == 64
+typedef uint64_t tp_vector;
+typedef uint64_t tp_mask;
+#elif   __WORDSIZE == 32
+typedef uint32_t tp_vector;
+typedef uint32_t tp_mask;
+#endif
+
+#define PREFETCH(x)
+
+static const tp_vector ONES=((~((tp_vector)0))/255);            /* 0x01 in every byte; static so each file including this header gets a private copy instead of a clashing external definition */
+static const tp_vector HIGH_BIT=(((~((tp_vector)0))/255)*0x80); /* 0x80 in every byte */
+
+SI tp_mask get_mask(tp_vector x){  return  x&HIGH_BIT; }   /* keep only the top bit of each byte of x */
+SI int NONZERO_MASK(tp_vector x){ return get_mask(x)!=0; } /* nonzero when at least one byte of x has its top bit set */
+
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+SI tp_mask bit_i(int i){            return ((tp_mask) 1)<<(8*(i%BYTES_AT_ONCE)+(i/BYTES_AT_ONCE) );} /* mask bit for position i: byte i%BYTES_AT_ONCE, bit i/BYTES_AT_ONCE inside that byte */
+#elif __BYTE_ORDER == __BIG_ENDIAN
+SI tp_mask bit_i(int i){            return ((tp_mask) 1)<<(8*(BYTES_AT_ONCE-1-i%BYTES_AT_ONCE)+(i/BYTES_AT_ONCE) );} /* same as above with the byte index mirrored within the word */
+#endif
+
+SI tp_mask forget_first_bit(tp_mask t,int y){return t-bit_i(y);} /* clears position y by subtraction, so it is only correct when that bit is set in t */
+#ifdef CALCULATE_MASK
+SI int  calculate_mask_before_after(void){ int i,j; /* generator: prints C source for the kill_before[]/kill_after[] tables of the current configuration; always returns 0 */
+  printf("static tp_mask kill_before[]={");
+   for(j=0;j<8*BYTES_AT_ONCE;j++){
+    tp_mask mask=0;
+    for(i=j;i<8*BYTES_AT_ONCE;i++) mask|=bit_i(i); /* entry j keeps every position >= j */
+    printf("0x%llx,",(unsigned long long)mask);     /* cast needed: %llx consumes an unsigned long long, while tp_mask is uint32_t when __WORDSIZE == 32 */
+  }
+  printf("0};\n");
+  printf("static tp_mask kill_after[]={");
+   for(j=0;j<8*BYTES_AT_ONCE;j++){
+    tp_mask mask=0;
+    for(i=0;i<=j;i++) mask|=bit_i(i);               /* entry j keeps every position <= j */
+    printf("0x%llx,",(unsigned long long)mask);
+  }
+  printf("0};\n"); return 0;                        /* explicit result: the function is declared to return int */
+}
+#endif
+#if __WORDSIZE == 32
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]={0xffffffff,0xfffffffe,0xfffffefe,0xfffefefe,0xfefefefe,0xfefefefc,0xfefefcfc,0xfefcfcfc,0xfcfcfcfc,0xfcfcfcf8,0xfcfcf8f8,0xfcf8f8f8,0xf8f8f8f8,0xf8f8f8f0,0xf8f8f0f0,0xf8f0f0f0,0xf0f0f0f0,0xf0f0f0e0,0xf0f0e0e0,0xf0e0e0e0,0xe0e0e0e0,0xe0e0e0c0,0xe0e0c0c0,0xe0c0c0c0,0xc0c0c0c0,0xc0c0c080,0xc0c08080,0xc0808080,0x80808080,0x80808000,0x80800000,0x80000000,0}; /* 32-bit little-endian: entry j is the OR of bit_i(i) for all i >= j */
+static tp_mask kill_after[]={0x1,0x101,0x10101,0x1010101,0x1010103,0x1010303,0x1030303,0x3030303,0x3030307,0x3030707,0x3070707,0x7070707,0x707070f,0x7070f0f,0x70f0f0f,0xf0f0f0f,0xf0f0f1f,0xf0f1f1f,0xf1f1f1f,0x1f1f1f1f,0x1f1f1f3f,0x1f1f3f3f,0x1f3f3f3f,0x3f3f3f3f,0x3f3f3f7f,0x3f3f7f7f,0x3f7f7f7f,0x7f7f7f7f,0x7f7f7fff,0x7f7fffff,0x7fffffff,0xffffffff,0}; /* 32-bit little-endian: entry j is the OR of bit_i(i) for all i <= j */
+#else
+static tp_mask kill_before[]={0xffffffff,0xfeffffff,0xfefeffff,0xfefefeff,0xfefefefe,0xfcfefefe,0xfcfcfefe,0xfcfcfcfe,0xfcfcfcfc,0xf8fcfcfc,0xf8f8fcfc,0xf8f8f8fc,0xf8f8f8f8,0xf0f8f8f8,0xf0f0f8f8,0xf0f0f0f8,0xf0f0f0f0,0xe0f0f0f0,0xe0e0f0f0,0xe0e0e0f0,0xe0e0e0e0,0xc0e0e0e0,0xc0c0e0e0,0xc0c0c0e0,0xc0c0c0c0,0x80c0c0c0,0x8080c0c0,0x808080c0,0x80808080,0x808080,0x8080,0x80,0};
+static tp_mask kill_after[]={0x1000000,0x1010000,0x1010100,0x1010101,0x3010101,0x3030101,0x3030301,0x3030303,0x7030303,0x7070303,0x7070703,0x7070707,0xf070707,0xf0f0707,0xf0f0f07,0xf0f0f0f,0x1f0f0f0f,0x1f1f0f0f,0x1f1f1f0f,0x1f1f1f1f,0x3f1f1f1f,0x3f3f1f1f,0x3f3f3f1f,0x3f3f3f3f,0x7f3f3f3f,0x7f7f3f3f,0x7f7f7f3f,0x7f7f7f7f,0xff7f7f7f,0xffff7f7f,0xffffff7f,0xffffffff,0};
+#endif
+#elif __WORDSIZE == 64
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static tp_mask kill_before[]={0xffffffffffffffff,0xfffffffffffffffe,0xfffffffffffffefe,0xfffffffffffefefe,0xfffffffffefefefe,0xfffffffefefefefe,0xfffffefefefefefe,0xfffefefefefefefe,0xfefefefefefefefe,0xfefefefefefefefc,0xfefefefefefefcfc,0xfefefefefefcfcfc,0xfefefefefcfcfcfc,0xfefefefcfcfcfcfc,0xfefefcfcfcfcfcfc,0xfefcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcfc,0xfcfcfcfcfcfcfcf8,0xfcfcfcfcfcfcf8f8,0xfcfcfcfcfcf8f8f8,0xfcfcfcfcf8f8f8f8,0xfcfcfcf8f8f8f8f8,0xfcfcf8f8f8f8f8f8,0xfcf8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8f8,0xf8f8f8f8f8f8f8f0,0xf8f8f8f8f8f8f0f0,0xf8f8f8f8f8f0f0f0,0xf8f8f8f8f0f0f0f0,0xf8f8f8f0f0f0f0f0,0xf8f8f0f0f0f0f0f0,0xf8f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0e0,0xf0f0f0f0f0f0e0e0,0xf0f0f0f0f0e0e0e0,0xf0f0f0f0e0e0e0e0,0xf0f0f0e0e0e0e0e0,0xf0f0e0e0e0e0e0e0,0xf0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0e0,0xe0e0e0e0e0e0e0c0,0xe0e0e0e0e0e0c0c0,0xe0e0e0e0e0c0c0c0,0xe0e0e0e0c0c0c0c0,0xe0e0e0c0c0c0c0c0,0xe0e0c0c0c0c0c0c0,0xe0c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c0c0,0xc0c0c0c0c0c0c080,0xc0c0c0c0c0c08080,0xc0c0c0c0c0808080,0xc0c0c0c080808080,0xc0c0c08080808080,0xc0c0808080808080,0xc080808080808080,0x8080808080808080,0x8080808080808000,0x8080808080800000,0x8080808080000000,0x8080808000000000,0x8080800000000000,0x8080000000000000,0x8000000000000000,0};
+static tp_mask kill_after[]={0x1,0x101,0x10101,0x1010101,0x101010101,0x10101010101,0x1010101010101,0x101010101010101,0x101010101010103,0x101010101010303,0x101010101030303,0x101010103030303,0x101010303030303,0x101030303030303,0x103030303030303,0x303030303030303,0x303030303030307,0x303030303030707,0x303030303070707,0x303030307070707,0x303030707070707,0x303070707070707,0x307070707070707,0x707070707070707,0x70707070707070f,0x707070707070f0f,0x7070707070f0f0f,0x70707070f0f0f0f,0x707070f0f0f0f0f,0x7070f0f0f0f0f0f,0x70f0f0f0f0f0f0f,0xf0f0f0f0f0f0f0f,0xf0f0f0f0f0f0f1f,0xf0f0f0f0f0f1f1f,0xf0f0f0f0f1f1f1f,0xf0f0f0f1f1f1f1f,0xf0f0f1f1f1f1f1f,0xf0f1f1f1f1f1f1f,0xf1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f1f,0x1f1f1f1f1f1f1f3f,0x1f1f1f1f1f1f3f3f,0x1f1f1f1f1f3f3f3f,0x1f1f1f1f3f3f3f3f,0x1f1f1f3f3f3f3f3f,0x1f1f3f3f3f3f3f3f,0x1f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f3f,0x3f3f3f3f3f3f3f7f,0x3f3f3f3f3f3f7f7f,0x3f3f3f3f3f7f7f7f,0x3f3f3f3f7f7f7f7f,0x3f3f3f7f7f7f7f7f,0x3f3f7f7f7f7f7f7f,0x3f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7f7f,0x7f7f7f7f7f7f7fff,0x7f7f7f7f7f7fffff,0x7f7f7f7f7fffffff,0x7f7f7f7fffffffff,0x7f7f7fffffffffff,0x7f7fffffffffffff,0x7fffffffffffffff,0xffffffffffffffff,0};
+#elif __BYTE_ORDER == __BIG_ENDIAN
+static tp_mask kill_before[]={0xffffffffffffffff,0xfeffffffffffffff,0xfefeffffffffffff,0xfefefeffffffffff,0xfefefefeffffffff,0xfefefefefeffffff,0xfefefefefefeffff,0xfefefefefefefeff,0xfefefefefefefefe,0xfcfefefefefefefe,0xfcfcfefefefefefe,0xfcfcfcfefefefefe,0xfcfcfcfcfefefefe,0xfcfcfcfcfcfefefe,0xfcfcfcfcfcfcfefe,0xfcfcfcfcfcfcfcfe,0xfcfcfcfcfcfcfcfc,0xf8fcfcfcfcfcfcfc,0xf8f8fcfcfcfcfcfc,0xf8f8f8fcfcfcfcfc,0xf8f8f8f8fcfcfcfc,0xf8f8f8f8f8fcfcfc,0xf8f8f8f8f8f8fcfc,0xf8f8f8f8f8f8f8fc,0xf8f8f8f8f8f8f8f8,0xf0f8f8f8f8f8f8f8,0xf0f0f8f8f8f8f8f8,0xf0f0f0f8f8f8f8f8,0xf0f0f0f0f8f8f8f8,0xf0f0f0f0f0f8f8f8,0xf0f0f0f0f0f0f8f8,0xf0f0f0f0f0f0f0f8,0xf0f0f0f0f0f0f0f0,0xe0f0f0f0f0f0f0f0,0xe0e0f0f0f0f0f0f0,0xe0e0e0f0f0f0f0f0,0xe0e0e0e0f0f0f0f0,0xe0e0e0e0e0f0f0f0,0xe0e0e0e0e0e0f0f0,0xe0e0e0e0e0e0e0f0,0xe0e0e0e0e0e0e0e0,0xc0e0e0e0e0e0e0e0,0xc0c0e0e0e0e0e0e0,0xc0c0c0e0e0e0e0e0,0xc0c0c0c0e0e0e0e0,0xc0c0c0c0c0e0e0e0,0xc0c0c0c0c0c0e0e0,0xc0c0c0c0c0c0c0e0,0xc0c0c0c0c0c0c0c0,0x80c0c0c0c0c0c0c0,0x8080c0c0c0c0c0c0,0x808080c0c0c0c0c0,0x80808080c0c0c0c0,0x8080808080c0c0c0,0x808080808080c0c0,0x80808080808080c0,0x8080808080808080,0x80808080808080,0x808080808080,0x8080808080,0x80808080,0x808080,0x8080,0x80,0};
+static tp_mask kill_after[]={0x100000000000000,0x101000000000000,0x101010000000000,0x101010100000000,0x101010101000000,0x101010101010000,0x101010101010100,0x101010101010101,0x301010101010101,0x303010101010101,0x303030101010101,0x303030301010101,0x303030303010101,0x303030303030101,0x303030303030301,0x303030303030303,0x703030303030303,0x707030303030303,0x707070303030303,0x707070703030303,0x707070707030303,0x707070707070303,0x707070707070703,0x707070707070707,0xf07070707070707,0xf0f070707070707,0xf0f0f0707070707,0xf0f0f0f07070707,0xf0f0f0f0f070707,0xf0f0f0f0f0f0707,0xf0f0f0f0f0f0f07,0xf0f0f0f0f0f0f0f,0x1f0f0f0f0f0f0f0f,0x1f1f0f0f0f0f0f0f,0x1f1f1f0f0f0f0f0f,0x1f1f1f1f0f0f0f0f,0x1f1f1f1f1f0f0f0f,0x1f1f1f1f1f1f0f0f,0x1f1f1f1f1f1f1f0f,0x1f1f1f1f1f1f1f1f,0x3f1f1f1f1f1f1f1f,0x3f3f1f1f1f1f1f1f,0x3f3f3f1f1f1f1f1f,0x3f3f3f3f1f1f1f1f,0x3f3f3f3f3f1f1f1f,0x3f3f3f3f3f3f1f1f,0x3f3f3f3f3f3f3f1f,0x3f3f3f3f3f3f3f3f,0x7f3f3f3f3f3f3f3f,0x7f7f3f3f3f3f3f3f,0x7f7f7f3f3f3f3f3f,0x7f7f7f7f3f3f3f3f,0x7f7f7f7f7f3f3f3f,0x7f7f7f7f7f7f3f3f,0x7f7f7f7f7f7f7f3f,0x7f7f7f7f7f7f7f7f,0xff7f7f7f7f7f7f7f,0xffff7f7f7f7f7f7f,0xffffff7f7f7f7f7f,0xffffffff7f7f7f7f,0xffffffffff7f7f7f,0xffffffffffff7f7f,0xffffffffffffff7f,0xffffffffffffffff,0};
+#endif
+#endif
+SI tp_mask forget_before(tp_mask x,int y){ if (y>=PARA) return 0; return x&kill_before[y]; } /* drop the match bits of positions below y; nothing survives once y reaches PARA */
+SI tp_mask forget_after( tp_mask x,int y){ if (y<0)     return 0; return x&kill_after[y];  } /* keep only the match bits of positions up to and including y; nothing survives for negative y */
+
+
+SI tp_mask first_bit(tp_mask t,int y){ /* index of the first set position of t at or after y; the scan has no upper bound, so the caller must guarantee such a position exists */
+  for (;(t&bit_i(y))==0;y++) ;
+  return y;
+}
+BIN_OP(XOR,x^y)        /* NOTE(review): BIN_OP/UN_OP are defined outside this file; presumably they define an operation named by the first argument over x (and y) - confirm in vector.h */
+BIN_OP(OR,x|y)
+BIN_OP(AND,x&y)
+BIN_OP(ANDNOT,x&(~y))
+UN_OP(TEST_ZERO,(AND(~(OR(x,HIGH_BIT)-ONES),~(x)))) /* top bit of a byte ends up set exactly when that byte of x is zero: forcing every top bit on keeps the subtraction of ONES from borrowing across bytes; the lower bits of the result are not meaningful */
+BIN_OP(TEST_EQ,TEST_ZERO(XOR(x,y))); /* top bit set in every byte where x and y hold the same value */
+
+#define SHIFT_DOWN(x,y) ((x)>>(8*(y))) /* shift x towards bit 0 by y whole bytes */
+#define SHIFT_UP(x,y)   ((x)<<(8*(y))) /* shift x away from bit 0 by y whole bytes */
+#define CONCAT(x,y,n) (((n)==0) ? (y) : (((n)==BYTES_AT_ONCE) ? (x) : OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))) /* x shifted up by BYTES_AT_ONCE-n bytes combined with y shifted down by n bytes; n==0 and n==BYTES_AT_ONCE are special-cased so no shift by the full word width happens; n is parenthesized so expression arguments compare correctly */
+
+SI tp_vector BROADCAST(uchar c){    tp_vector every_byte=ONES; return every_byte*c; } /* replicate c into every byte of a vector */
+SI tp_vector BROADCAST_ZERO(void){  return (tp_vector)0; }                            /* vector with all bytes zero */
+
+#ifdef DEBUG
+void inspect_mask(tp_mask m){int i; /* debug aid: print positions 0..PARA-1 of m as one row of 0/1 digits followed by a newline */
+  for(i=0;i<PARA;i++) printf("%s",(m&bit_i(i)) ?  "1" : "0"); /* literal format string: same output, no -Wformat-security warning */
+  printf("\n");
+}
+#endif
+
+
+/* TODO implement TEST_RANGE for generic parallel_tolower
+SI tp_vector TEST_RANGE(tp_vector v,uchar from,uchar to){
+	tp_vector fv=BROADCAST(-127-from);
+	v=_mm_add_epi8(v,fv);
+	tp_vector tv=BROADCAST(-127+to-from+1);
+	return _mm_cmplt_epi8(v,tv);
+}
+
+SI tp_vector parallel_tolower(tp_vector m){tp_mask mask; 
+	tp_vector high_bit=BROADCAST(128);
+  tp_vector l= AND(TEST_RANGE(m,'A','Z'),high_bit);
+	m=OR(m,_mm_srli_epi64(l,2));
+	if ((mask=get_mask(m))){int i;
+    while(mask){ i=first_bit(mask); mask=forget_first_bit(mask,i);
+			((uchar*)&m)[i]=tolower(((uchar*)&m)[i]);
+    }
+	}
+	return m;
+}
+*/
+
+
+SI tp_vector parallel_tolower(tp_vector m){ /* returns a vector whose every byte is the tolower_fixed[] entry for the matching byte of m */
+  tp_vector r; uchar *dst=(uchar*)&r, *src=(uchar*)&m;
+  int k=0;
+  while(k<sizeof(tp_vector)){ dst[k]=tolower_fixed[src[k]]; k++; }
+  return r;
+}
+
+SI tp_vector LOAD(uchar *x){ tp_vector *vp=(tp_vector*)(x); return *vp; } /* read one whole vector starting at x through a pointer cast */
+#define LOAD_UNALIGNED LOAD /* the arithmetic variant uses the same plain load for unaligned reads */
+
+#if unroll==1
+  #define AGREGATE_MASK    (mask0>>7) /* get_mask leaves bit 7 of each byte set; bit_i expects vector 0 at bit 0 of each byte */
+#elif unroll==2
+  #define AGREGATE_MASK       ((mask0>>7)|(mask1>>6)) /* vector u lands on bit u of each byte, matching bit_i */
+#elif unroll==4
+  #define AGREGATE_MASK   (((mask0>>7)|(mask1>>6))|((mask2>>5)|(mask3>>4))) /* fully parenthesized so the macro is safe inside larger expressions */
+#endif
diff --git a/string/loop.h b/string/loop.h
new file mode 100644
index 0000000..b1e649e
--- /dev/null
+++ b/string/loop.h
@@ -0,0 +1,144 @@
+/* Basic string search loop.  To use it, define the macros below and include this file.
+  TEST_CODE(so,sn)  Given a consecutive sequence so,sn of bytes, you should produce a
+                    vector.  For bytes with the highest bit set to 1, the loop invokes the macro
+                    LOOP_BODY(p), where p is the corresponding byte in sn.
+  LOOP_BODY(p)      See above.
+  DETECT_END(p)     When byte p is reached, call the macro LOOP_END(p).
+  DETECT_ZERO_BYTE  When the first zero byte is reached, call LOOP_END(p).
+  LOOP_END(p)       See above.
+
+  CAN_SKIP          You have to define a skip_to variable.  Then the loop will not call
+                    LOOP_BODY(p) when p<skip_to.  A LOOP_END condition will still be processed.
+
+  This file should be included inside a function.  The loop uses the local variable s as the matched string.
+  Note that an implementation by callback is complicated by the fact that you usually need a closure to
+       share arguments.
+*/
+
+#ifdef DETECT_ZERO_BYTE
+  #define _DETECT_ZERO_BYTE mvec= OR(mvec,TEST_ZERO(sz)); /* also flag every zero byte of the vector just loaded into sz */
+  #define _TEST_ZERO_BYTE (*p==0) /* per-candidate check used when walking the mask: is p the terminator? */
+#else
+  #define _DETECT_ZERO_BYTE  /* zero bytes are not special: expands to nothing */
+  #define _TEST_ZERO_BYTE 0  /* the terminator check is compiled out */
+#endif
+#ifdef DETECT_END
+  #define _DETECT_END(u) (DETECT_END<=s2+u*BYTES_AT_ONCE) /* true once the end pointer lies at or before the start of vector u of the current block */
+  if  (DETECT_END == s){ /* the end pointer equals the start pointer: finish immediately */
+    uchar UNUSED *p=s;
+    LOOP_END(p);
+  }
+#else
+  #define     DETECT_END  ((uchar*)NULL) /* no end pointer given: placeholder so later uses of DETECT_END still compile */
+  #define _DETECT_END(u)  0 /* the end test is compiled out */
+#endif
+
+
+   #define TEST(u) /* evaluate vector u of the current block and store its match vector in mvec##u */ \
+     mvec=vzero; /* overwritten by the TEST_CODE assignment below */ \
+     so=sn; /* the previously loaded vector becomes the "old" one */ \
+     sn=sz=LOAD(s2+u*BYTES_AT_ONCE); /* load vector u of the block starting at s2 */ \
+     mvec    = TEST_CODE(so,sn); /* user-supplied matcher over the old and new vectors */ \
+     _DETECT_ZERO_BYTE; /* OR in the zero-byte matches when DETECT_ZERO_BYTE is defined */ \
+     mvec##u = mvec;
+
+
+   int  i; /* position of the match currently being handled, relative to s2 */
+   tp_vector vzero=BROADCAST(0);
+   tp_vector sn,so,sz; /* newest vector, the one before it, and the vector tested for zero bytes */
+   int s_offset; uchar* s2; /* NOTE(review): neither is assigned in this fragment; presumably ALIGN below sets s2 to the aligned block start and s_offset to s-s2 - confirm in vector.h */
+   sn=vzero; /* no data precedes the first vector, so the first "old" vector is all zero */
+   ALIGN(s,unroll);
+   tp_vector mvec; 
+   tp_mask mask, UNUSED zmask;
+  #undef ACTION
+  #define ACTION(x) tp_vector mvec##x; tp_mask mask##x; /* per-vector match vector and mask */
+  DO_ACTION; /* NOTE(review): DO_ACTION is defined elsewhere; presumably it expands ACTION once per unrolled vector - confirm */
+  #undef ACTION
+  #define ACTION(x) TEST(x)
+   DO_ACTION; /* test the first block */
+  #undef ACTION
+  #define ACTION(x) mask##x=get_mask(mvec##x);
+   DO_ACTION; /* reduce each match vector to its top-bit mask */
+   mask=AGREGATE_MASK; /* merge the per-vector masks into one mask */
+   mask=forget_before(mask,s_offset); /* ignore matches at positions before s_offset */
+   if (mask||_DETECT_END(unroll)) goto test; /* a match or the end pointer in the first block: handle it before entering the main loop */
+   start:; /* re-entered from the bottom of the fragment after a block's matches have been handled */
+   while(1){
+     s2+=PARA; /* advance to the next block */
+     PREFETCH(s2+prefetch*CACHE_LINE_SIZE);
+    #undef ACTION
+    #define ACTION(x) TEST(x)
+     DO_ACTION; /* recompute mvec##x for every vector of the block */
+
+     if(NONZERO_MASK(AGREGATE_VECTOR)||_DETECT_END(unroll)){ /* leave the loop only when some vector matched or the end pointer was reached */
+       /* On x86-64, OR is a destructive operation;
+          in the case of strlen it is faster to recalculate 
+          mvec0,mvec2 than to move them to separate registers.*/
+      #undef ACTION
+      #define ACTION(x) mask##x=get_mask(mvec##x);
+       DO_ACTION;
+       mask=AGREGATE_MASK; /* the exact per-position mask is only built on this exit path */
+       goto test;
+     }
+   }
+   test:; /* We need this control flow; otherwise gcc would duplicate this fragment.*/
+   int end=0; /* 0 = no stop position in this block; otherwise 1 + the last position that may still be reported */
+#ifdef CAN_SKIP
+   /* Detect the zero byte so that it cannot be skipped.*/
+#ifdef DETECT_ZERO_BYTE
+    #define ZTEST(u) /* like TEST(u), but records only the zero-byte matches in mvec##u */ \
+     sz=LOAD(s2+u*BYTES_AT_ONCE);\
+     mvec=vzero;\
+     _DETECT_ZERO_BYTE;\
+     mvec##u=mvec;
+
+     #undef ACTION
+     #define ACTION(x) ZTEST(x)
+     DO_ACTION; /* note: this overwrites mvec##x; the combined mask was already taken before reaching here */
+     #undef ACTION
+     #define ACTION(x) mask##x=get_mask(mvec##x);
+     DO_ACTION;
+     zmask=AGREGATE_MASK; /* positions of zero bytes in this block */
+     if (s>s2) /* the string starts inside this block: zero bytes before it do not count */
+       zmask=forget_before(zmask,s_offset);
+
+     if(zmask) end = first_bit(zmask,0)+1; /* stop at the first zero byte */
+#endif
+     if(skip_to>s2) /* drop the matches that lie before skip_to */
+       mask=forget_before(mask,skip_to-s2);
+#endif
+     if (_DETECT_END(unroll)){/* We need to handle the case when the end is at the start of the next page here.*/
+       end = min(DETECT_END-s2-1,end ? (end-1) : 64)+1; /* the earlier of the end pointer and a zero-byte stop; 64 is used when no zero-byte stop was found */
+     }
+     if (end){
+      mask=forget_after(mask,end-1); /* discard matches beyond the stop position */
+     }
+     i=0;
+     while(mask){ i=first_bit(mask,i); /* visit the remaining matches in increasing position order */
+        uchar UNUSED *p=s2+i;
+        if(__builtin_expect(_TEST_ZERO_BYTE,0)){ /* the match is the terminator itself */
+          LOOP_END(p)
+        }
+        LOOP_BODY(p)
+#ifdef CAN_SKIP
+        mask=forget_before(mask,skip_to-s2); /* NOTE(review): progress relies on LOOP_BODY moving skip_to past p - confirm in the users of CAN_SKIP */
+#else
+        mask=forget_first_bit(mask,i); /* clear the match just handled */
+#endif
+     }
+     if(end){ /* every match before the stop position is handled: finish */
+       uchar UNUSED *p=DETECT_END; /* NOTE(review): p is the NULL placeholder here when DETECT_END was not supplied by the includer - confirm LOOP_END tolerates that */
+       LOOP_END(p);
+     }
+
+   goto start; /* no stop position in this block: continue with the next one */
+
+
+#undef CAN_SKIP
+
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef ACTION
+#undef DETECT_END
+#undef _DETECT_END
diff --git a/string/memchr.c b/string/memchr.c
index 22637cf..7f3537d 100644
--- a/string/memchr.c
+++ b/string/memchr.c
@@ -29,7 +29,6 @@
 
 #if defined _LIBC
 # include <string.h>
-# include <memcopy.h>
 #endif
 
 #if HAVE_STDLIB_H || defined _LIBC
@@ -40,12 +39,6 @@
 # include <limits.h>
 #endif
 
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
 #include <sys/types.h>
 #if HAVE_BP_SYM_H || defined _LIBC
 #include <bp-sym.h>
@@ -56,152 +49,15 @@
 #undef memchr
 #undef __memchr
 
-/* Search no more than N bytes of S for C.  */
-__ptr_t
-__memchr (s, c_in, n)
-     const __ptr_t s;
-     int c_in;
-     size_t n;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n, ++char_ptr)
-    if (*char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
-#endif
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++ ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
+#ifndef MEMCHR
+#define MEMCHR __memchr
 #endif
-	}
 
-      n -= sizeof (longword);
-    }
+#define AS_MEMCHR
+#include "strchr.h"
 
-  char_ptr = (const unsigned char *) longword_ptr;
 
-  while (n-- > 0)
-    {
-      if (*char_ptr == c)
-	return (__ptr_t) char_ptr;
-      else
-	++char_ptr;
-    }
 
-  return 0;
-}
 #ifdef weak_alias
 weak_alias (__memchr, BP_SYM (memchr))
 #endif
diff --git a/string/memmem.c b/string/memmem.c
index 625c9cf..f224a8e 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -1,77 +1,7 @@
-/* Copyright (C) 1991,92,93,94,96,97,98,2000,2004,2008 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* This particular implementation was written by Eric Blake, 2008.  */
-
-#ifndef _LIBC
-# include <config.h>
+#ifndef MEMMEM
+#define MEMMEM memmem
 #endif
 
-/* Specification of memmem.  */
-#include <string.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
-#endif
-
-#define RETURN_TYPE void *
-#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
-#include "str-two-way.h"
-
-#undef memmem
-
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
-void *
-memmem (const void *haystack_start, size_t haystack_len,
-	const void *needle_start, size_t needle_len)
-{
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
-
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
-
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (__builtin_expect (haystack_len < needle_len, 0))
-    return NULL;
-
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || __builtin_expect (needle_len == 1, 0))
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
-    }
-  else
-    return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-}
+#define AS_MEMMEM
+#include "strstr.h"
 
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/memrchr.c b/string/memrchr.c
index 2826f13..f736897 100644
--- a/string/memrchr.c
+++ b/string/memrchr.c
@@ -30,23 +30,6 @@
 #undef __ptr_t
 #define __ptr_t void *
 
-#if defined _LIBC
-# include <string.h>
-# include <memcopy.h>
-#endif
-
-#if defined HAVE_LIMITS_H || defined _LIBC
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-# define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
-
 #undef __memrchr
 #undef memrchr
 
@@ -54,155 +37,13 @@
 # define __memrchr memrchr
 #endif
 
-/* Search no more than N bytes of S for C.  */
-__ptr_t
 #ifndef MEMRCHR
-__memrchr
-#else
-MEMRCHR
-#endif
-     (s, c_in, n)
-     const __ptr_t s;
-     int c_in;
-     size_t n;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the last few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s + n;
-       n > 0 && ((unsigned long int) char_ptr
-		 & (sizeof (longword) - 1)) != 0;
-       --n)
-    if (*--char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (const unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
+#define MEMRCHR __memrchr /* NOTE(review): MEMRCHR is now always defined from this point on, so the later "#ifndef MEMRCHR" guard around weak_alias (__memrchr, memrchr) looks like it can never be taken - confirm the memrchr alias is still emitted */
 #endif
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
-#endif
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (n >= sizeof (longword))
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *--longword_ptr ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) longword_ptr;
-
-#if LONG_MAX > 2147483647
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-#endif
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	}
-
-      n -= sizeof (longword);
-    }
-
-  char_ptr = (const unsigned char *) longword_ptr;
-
-  while (n-- > 0)
-    {
-      if (*--char_ptr == c)
-	return (__ptr_t) char_ptr;
-    }
+#define AS_MEMRCHR
+#include "strchr.h"
 
-  return 0;
-}
 #ifndef MEMRCHR
 # ifdef weak_alias
 weak_alias (__memrchr, memrchr)
diff --git a/string/rawmemchr.c b/string/rawmemchr.c
index 90e8c7c..7e255b7 100644
--- a/string/rawmemchr.c
+++ b/string/rawmemchr.c
@@ -29,154 +29,19 @@
 
 #if defined (_LIBC)
 # include <string.h>
-# include <memcopy.h>
 # include <stdlib.h>
 #endif
 
-#if defined (HAVE_LIMITS_H) || defined (_LIBC)
-# include <limits.h>
-#endif
-
-#define LONG_MAX_32_BITS 2147483647
-
-#ifndef LONG_MAX
-#define LONG_MAX LONG_MAX_32_BITS
-#endif
-
-#include <sys/types.h>
-
 #undef memchr
 
-
-/* Find the first occurrence of C in S.  */
-__ptr_t
-__rawmemchr (s, c_in)
-     const __ptr_t s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c)
-      return (__ptr_t) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-
-  if (sizeof (longword) != 4 && sizeof (longword) != 8)
-    abort ();
-
-#if LONG_MAX <= LONG_MAX_32_BITS
-  magic_bits = 0x7efefeff;
-#else
-  magic_bits = ((unsigned long int) 0x7efefefe << 32) | 0xfefefeff;
-#endif
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-#if LONG_MAX > LONG_MAX_32_BITS
-  charmask |= charmask << 32;
+#ifndef RAWMEMCHR
+#define RAWMEMCHR __rawmemchr
 #endif
 
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (1)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
+#define AS_RAWMEMCHR
+#include "strchr.h"
 
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
 
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
 
-      longword = *longword_ptr++ ^ charmask;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (cp[0] == c)
-	    return (__ptr_t) cp;
-	  if (cp[1] == c)
-	    return (__ptr_t) &cp[1];
-	  if (cp[2] == c)
-	    return (__ptr_t) &cp[2];
-	  if (cp[3] == c)
-	    return (__ptr_t) &cp[3];
-#if LONG_MAX > 2147483647
-	  if (cp[4] == c)
-	    return (__ptr_t) &cp[4];
-	  if (cp[5] == c)
-	    return (__ptr_t) &cp[5];
-	  if (cp[6] == c)
-	    return (__ptr_t) &cp[6];
-	  if (cp[7] == c)
-	    return (__ptr_t) &cp[7];
-#endif
-	}
-    }
-}
 libc_hidden_def (__rawmemchr)
 weak_alias (__rawmemchr, rawmemchr)
diff --git a/string/str-two-way.h b/string/str-two-way.h
deleted file mode 100644
index 1b2a8bd..0000000
--- a/string/str-two-way.h
+++ /dev/null
@@ -1,428 +0,0 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
-   Copyright (C) 2008, 2010 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Written by Eric Blake <ebb9@byu.net>, 2008.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* Before including this file, you need to include <string.h> (and
-   <config.h> before that, if not part of libc), and define:
-     RESULT_TYPE             A macro that expands to the return type.
-     AVAILABLE(h, h_l, j, n_l)
-			     A macro that returns nonzero if there are
-			     at least N_L bytes left starting at H[J].
-			     H is 'unsigned char *', H_L, J, and N_L
-			     are 'size_t'; H_L is an lvalue.  For
-			     NUL-terminated searches, H_L can be
-			     modified each iteration to avoid having
-			     to compute the end of H up front.
-
-  For case-insensitivity, you may optionally define:
-     CMP_FUNC(p1, p2, l)     A macro that returns 0 iff the first L
-			     characters of P1 and P2 are equal.
-     CANON_ELEMENT(c)        A macro that canonicalizes an element right after
-			     it has been fetched from one of the two strings.
-			     The argument is an 'unsigned char'; the result
-			     must be an 'unsigned char' as well.
-
-  This file undefines the macros documented above, and defines
-  LONG_NEEDLE_THRESHOLD.
-*/
-
-#include <limits.h>
-#include <stdint.h>
-
-/* We use the Two-Way string matching algorithm, which guarantees
-   linear complexity with constant space.  Additionally, for long
-   needles, we also use a bad character shift table similar to the
-   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
-   performance.
-
-   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
-   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
-*/
-
-/* Point at which computing a bad-byte shift table is likely to be
-   worthwhile.  Small needles should not compute a table, since it
-   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
-   speedup no greater than a factor of NEEDLE_LEN.  The larger the
-   needle, the better the potential performance gain.  On the other
-   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
-   memory required for the table is prohibitive.  */
-#if CHAR_BIT < 10
-# define LONG_NEEDLE_THRESHOLD 32U
-#else
-# define LONG_NEEDLE_THRESHOLD SIZE_MAX
-#endif
-
-#ifndef MAX
-# define MAX(a, b) ((a < b) ? (b) : (a))
-#endif
-
-#ifndef CANON_ELEMENT
-# define CANON_ELEMENT(c) c
-#endif
-#ifndef CMP_FUNC
-# define CMP_FUNC memcmp
-#endif
-
-/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
-   Return the index of the first byte in the right half, and set
-   *PERIOD to the global period of the right half.
-
-   The global period of a string is the smallest index (possibly its
-   length) at which all remaining bytes in the string are repetitions
-   of the prefix (the last repetition may be a subset of the prefix).
-
-   When NEEDLE is factored into two halves, a local period is the
-   length of the smallest word that shares a suffix with the left half
-   and shares a prefix with the right half.  All factorizations of a
-   non-empty NEEDLE have a local period of at least 1 and no greater
-   than NEEDLE_LEN.
-
-   A critical factorization has the property that the local period
-   equals the global period.  All strings have at least one critical
-   factorization with the left half smaller than the global period.
-
-   Given an ordered alphabet, a critical factorization can be computed
-   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
-   larger of two ordered maximal suffixes.  The ordered maximal
-   suffixes are determined by lexicographic comparison of
-   periodicity.  */
-static size_t
-critical_factorization (const unsigned char *needle, size_t needle_len,
-			size_t *period)
-{
-  /* Index of last byte of left half, or SIZE_MAX.  */
-  size_t max_suffix, max_suffix_rev;
-  size_t j; /* Index into NEEDLE for current candidate suffix.  */
-  size_t k; /* Offset into current period.  */
-  size_t p; /* Intermediate period.  */
-  unsigned char a, b; /* Current comparison bytes.  */
-
-  /* Invariants:
-     0 <= j < NEEDLE_LEN - 1
-     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
-     min(max_suffix, max_suffix_rev) < global period of NEEDLE
-     1 <= p <= global period of NEEDLE
-     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
-     1 <= k <= p
-  */
-
-  /* Perform lexicographic search.  */
-  max_suffix = SIZE_MAX;
-  j = 0;
-  k = p = 1;
-  while (j + k < needle_len)
-    {
-      a = CANON_ELEMENT (needle[j + k]);
-      b = CANON_ELEMENT (needle[max_suffix + k]);
-      if (a < b)
-	{
-	  /* Suffix is smaller, period is entire prefix so far.  */
-	  j += k;
-	  k = 1;
-	  p = j - max_suffix;
-	}
-      else if (a == b)
-	{
-	  /* Advance through repetition of the current period.  */
-	  if (k != p)
-	    ++k;
-	  else
-	    {
-	      j += p;
-	      k = 1;
-	    }
-	}
-      else /* b < a */
-	{
-	  /* Suffix is larger, start over from current location.  */
-	  max_suffix = j++;
-	  k = p = 1;
-	}
-    }
-  *period = p;
-
-  /* Perform reverse lexicographic search.  */
-  max_suffix_rev = SIZE_MAX;
-  j = 0;
-  k = p = 1;
-  while (j + k < needle_len)
-    {
-      a = CANON_ELEMENT (needle[j + k]);
-      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
-      if (b < a)
-	{
-	  /* Suffix is smaller, period is entire prefix so far.  */
-	  j += k;
-	  k = 1;
-	  p = j - max_suffix_rev;
-	}
-      else if (a == b)
-	{
-	  /* Advance through repetition of the current period.  */
-	  if (k != p)
-	    ++k;
-	  else
-	    {
-	      j += p;
-	      k = 1;
-	    }
-	}
-      else /* a < b */
-	{
-	  /* Suffix is larger, start over from current location.  */
-	  max_suffix_rev = j++;
-	  k = p = 1;
-	}
-    }
-
-  /* Choose the longer suffix.  Return the first byte of the right
-     half, rather than the last byte of the left half.  */
-  if (max_suffix_rev + 1 < max_suffix + 1)
-    return max_suffix + 1;
-  *period = p;
-  return max_suffix_rev + 1;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
-   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
-   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
-   Performance is guaranteed to be linear, with an initialization cost
-   of 2 * NEEDLE_LEN comparisons.
-
-   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
-   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
-   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
-   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
-static RETURN_TYPE
-two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
-		      const unsigned char *needle, size_t needle_len)
-{
-  size_t i; /* Index into current byte of NEEDLE.  */
-  size_t j; /* Index into current window of HAYSTACK.  */
-  size_t period; /* The period of the right half of needle.  */
-  size_t suffix; /* The index of the right half of needle.  */
-
-  /* Factor the needle into two halves, such that the left half is
-     smaller than the global period, and the right half is
-     periodic (with a period as large as NEEDLE_LEN - suffix).  */
-  suffix = critical_factorization (needle, needle_len, &period);
-
-  /* Perform the search.  Each iteration compares the right half
-     first.  */
-  if (CMP_FUNC (needle, needle + period, suffix) == 0)
-    {
-      /* Entire needle is periodic; a mismatch can only advance by the
-	 period, so use memory to avoid rescanning known occurrences
-	 of the period.  */
-      size_t memory = 0;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Scan for matches in right half.  */
-	  i = MAX (suffix, memory);
-	  while (i < needle_len && (CANON_ELEMENT (needle[i])
-				    == CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i + 1 < memory + 1)
-		return (RETURN_TYPE) (haystack + j);
-	      /* No match, so remember how many repetitions of period
-		 on the right half were scanned.  */
-	      j += period;
-	      memory = needle_len - period;
-	    }
-	  else
-	    {
-	      j += i - suffix + 1;
-	      memory = 0;
-	    }
-	}
-    }
-  else
-    {
-      /* The two halves of needle are distinct; no extra memory is
-	 required, and any mismatch results in a maximal shift.  */
-      period = MAX (suffix, needle_len - suffix) + 1;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Scan for matches in right half.  */
-	  i = suffix;
-	  while (i < needle_len && (CANON_ELEMENT (needle[i])
-				    == CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
-				       == CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i == SIZE_MAX)
-		return (RETURN_TYPE) (haystack + j);
-	      j += period;
-	    }
-	  else
-	    j += i - suffix + 1;
-	}
-    }
-  return NULL;
-}
-
-/* Return the first location of non-empty NEEDLE within HAYSTACK, or
-   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
-   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
-   Performance is guaranteed to be linear, with an initialization cost
-   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
-
-   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
-   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
-   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
-   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
-   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
-   sublinear performance is not possible.  */
-static RETURN_TYPE
-two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
-		     const unsigned char *needle, size_t needle_len)
-{
-  size_t i; /* Index into current byte of NEEDLE.  */
-  size_t j; /* Index into current window of HAYSTACK.  */
-  size_t period; /* The period of the right half of needle.  */
-  size_t suffix; /* The index of the right half of needle.  */
-  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
-
-  /* Factor the needle into two halves, such that the left half is
-     smaller than the global period, and the right half is
-     periodic (with a period as large as NEEDLE_LEN - suffix).  */
-  suffix = critical_factorization (needle, needle_len, &period);
-
-  /* Populate shift_table.  For each possible byte value c,
-     shift_table[c] is the distance from the last occurrence of c to
-     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
-     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
-  for (i = 0; i < 1U << CHAR_BIT; i++)
-    shift_table[i] = needle_len;
-  for (i = 0; i < needle_len; i++)
-    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
-
-  /* Perform the search.  Each iteration compares the right half
-     first.  */
-  if (CMP_FUNC (needle, needle + period, suffix) == 0)
-    {
-      /* Entire needle is periodic; a mismatch can only advance by the
-	 period, so use memory to avoid rescanning known occurrences
-	 of the period.  */
-      size_t memory = 0;
-      size_t shift;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Check the last byte first; if it does not match, then
-	     shift to the next possible match location.  */
-	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
-	  if (0 < shift)
-	    {
-	      if (memory && shift < period)
-		{
-		  /* Since needle is periodic, but the last period has
-		     a byte out of place, there can be no match until
-		     after the mismatch.  */
-		  shift = needle_len - period;
-		}
-	      memory = 0;
-	      j += shift;
-	      continue;
-	    }
-	  /* Scan for matches in right half.  The last byte has
-	     already been matched, by virtue of the shift table.  */
-	  i = MAX (suffix, memory);
-	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len - 1 <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i + 1 < memory + 1)
-		return (RETURN_TYPE) (haystack + j);
-	      /* No match, so remember how many repetitions of period
-		 on the right half were scanned.  */
-	      j += period;
-	      memory = needle_len - period;
-	    }
-	  else
-	    {
-	      j += i - suffix + 1;
-	      memory = 0;
-	    }
-	}
-    }
-  else
-    {
-      /* The two halves of needle are distinct; no extra memory is
-	 required, and any mismatch results in a maximal shift.  */
-      size_t shift;
-      period = MAX (suffix, needle_len - suffix) + 1;
-      j = 0;
-      while (AVAILABLE (haystack, haystack_len, j, needle_len))
-	{
-	  /* Check the last byte first; if it does not match, then
-	     shift to the next possible match location.  */
-	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
-	  if (0 < shift)
-	    {
-	      j += shift;
-	      continue;
-	    }
-	  /* Scan for matches in right half.  The last byte has
-	     already been matched, by virtue of the shift table.  */
-	  i = suffix;
-	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
-					== CANON_ELEMENT (haystack[i + j])))
-	    ++i;
-	  if (needle_len - 1 <= i)
-	    {
-	      /* Scan for matches in left half.  */
-	      i = suffix - 1;
-	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
-				       == CANON_ELEMENT (haystack[i + j])))
-		--i;
-	      if (i == SIZE_MAX)
-		return (RETURN_TYPE) (haystack + j);
-	      j += period;
-	    }
-	  else
-	    j += i - suffix + 1;
-	}
-    }
-  return NULL;
-}
-
-#undef AVAILABLE
-#undef CANON_ELEMENT
-#undef CMP_FUNC
-#undef RETURN_TYPE
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 9e1bde9..1111a07 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -1,54 +1,3 @@
-/* Return the offset of one string within another.
-   Copyright (C) 1994, 1996-2000, 2004, 2008, 2009, 2010
-   Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/*
- * My personal strstr() implementation that beats most other algorithms.
- * Until someone tells me otherwise, I assume that this is the
- * fastest implementation of strstr() in C.
- * I deliberately chose not to comment it.  You should have at least
- * as much fun trying to understand it, as I had to write it :-).
- *
- * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de	*/
-
-#if HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-/* Specification.  */
-#include <string.h>
-
-#include <ctype.h>
-#include <stdbool.h>
-#include <strings.h>
-
-#define TOLOWER(Ch) (isupper (Ch) ? tolower (Ch) : (Ch))
-
-/* Two-Way algorithm.  */
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
-#define CANON_ELEMENT(c) TOLOWER (c)
-#define CMP_FUNC(p1, p2, l)				\
-  __strncasecmp ((const char *) (p1), (const char *) (p2), l)
-#include "str-two-way.h"
-
 #undef strcasestr
 #undef __strcasestr
 
@@ -56,52 +5,12 @@
 #define STRCASESTR __strcasestr
 #endif
 
+#define AS_STRCASESTR
+#include "strstr.h"
 
-/* Find the first occurrence of NEEDLE in HAYSTACK, using
-   case-insensitive comparison.  This function gives unspecified
-   results in multibyte locales.  */
-char *
-STRCASESTR (const char *haystack_start, const char *needle_start)
-{
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    {
-      ok &= (TOLOWER ((unsigned char) *haystack)
-	     == TOLOWER ((unsigned char) *needle));
-      haystack++;
-      needle++;
-    }
-  if (*needle)
-    return NULL;
-  if (ok)
-    return (char *) haystack_start;
-  needle_len = needle - needle_start;
-  haystack = haystack_start + 1;
-  haystack_len = needle_len - 1;
-
-  /* Perform the search.  Abstract memory is considered to be an array
-     of 'unsigned char' values, not an array of 'char' values.  See
-     ISO C 99 section 6.2.6.1.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) needle_start,
-				 needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle_start,
-			      needle_len);
-}
-
-#undef LONG_NEEDLE_THRESHOLD
 
 #ifndef NO_ALIAS
 weak_alias (__strcasestr, strcasestr)
 #endif
+
+
diff --git a/string/strchr.c b/string/strchr.c
index 9d18b7e..6f57c7d 100644
--- a/string/strchr.c
+++ b/string/strchr.c
@@ -1,187 +1,10 @@
-/* Copyright (C) 1991,1993-1997,1999,2000,2003,2006
-   Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se) and
-   bug fix and commentary by Jim Blandy (jimb@ai.mit.edu);
-   adaptation to strchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
-   and implemented by Roland McGrath (roland@ai.mit.edu).
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <memcopy.h>
-#include <stdlib.h>
-
-#undef strchr
-
-/* Find the first occurrence of C in S.  */
-char *
-strchr (s, c_in)
-     const char *s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c)
-      return (void *) char_ptr;
-    else if (*char_ptr == '\0')
-      return NULL;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
+#ifndef STRCHR
+#define STRCHR strchr
+#endif
 
-	  if (*cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (*++cp == c)
-	    return (char *) cp;
-	  else if (*cp == '\0')
-	    return NULL;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	      if (*++cp == c)
-		return (char *) cp;
-	      else if (*cp == '\0')
-		return NULL;
-	    }
-	}
-    }
+#define AS_STRCHR
+#include "strchr.h"
 
-  return NULL;
-}
 
 #ifdef weak_alias
 #undef index
diff --git a/string/strchr.h b/string/strchr.h
new file mode 100644
index 0000000..ba8022c
--- /dev/null
+++ b/string/strchr.h
@@ -0,0 +1,53 @@
+#define unroll 4    /* Tuning knob for loop.h; presumably the unroll factor -- confirm.  */
+#define prefetch 8  /* Tuning knob for loop.h; presumably a prefetch distance -- confirm.  */
+/* Shared body of the *chr family: the includer defines one AS_* selector and the name macro.  */
+#include "vector.h" /* Presumably supplies uchar, tp_vector, BROADCAST, TEST_EQ, UNUSED -- confirm.  */
+/* Match test handed to loop.h: passes SN and VC (the broadcast C) to TEST_EQ; SO is ignored.  */
+#define TEST_CODE(so,sn) TEST_EQ(sn,vc)
+/* Termination: string variants request NUL detection, counted ones end at S + SS (clamped on wrap).  */
+#if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+  #define DETECT_ZERO_BYTE
+#endif
+#if defined(AS_MEMCHR) || defined(AS_MEMRCHR)
+  #define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1)))
+#endif
+/* Each selector below emits one function header.  loop.h is expected to run LOOP_BODY (p)
+   on a hit at P and LOOP_END (p) once the terminator/end is reached -- confirm in loop.h.  */
+#ifdef AS_STRCHR
+  #define LOOP_END(p) return NULL;
+  uchar* STRCHR(   const uchar *s, int c )
+#endif
+#ifdef AS_MEMCHR
+  #define LOOP_END(p) return NULL;
+  uchar* MEMCHR(   const uchar *s, int c , size_t ss)
+#endif
+/* Reverse searches record each hit in R and return R at the end; all others return the first hit.  */
+#if defined(AS_STRRCHR) || defined(AS_MEMRCHR)
+  #define LOOP_BODY(p) r=p;
+  #define LOOP_END(p) return r;
+  #ifdef AS_STRRCHR
+    uchar* STRRCHR(   const uchar *s, int c)
+  #endif
+  #ifdef AS_MEMRCHR
+    uchar* MEMRCHR(   const uchar *s, int c , size_t ss)
+  #endif
+#else
+  #define LOOP_BODY(p) return p;
+#endif
+/* rawmemchr has no length argument (the caller guarantees a match), hence no SS parameter.  */
+#ifdef AS_STRCHRNUL
+  #define LOOP_END(p) return p;
+  uchar* STRCHRNUL(const uchar *s, int c )
+#endif
+#ifdef AS_RAWMEMCHR
+  #define LOOP_END(p) /*cannot happen*/
+  uchar* RAWMEMCHR(const uchar *s, int c)
+#endif
+{
+  #if defined(AS_STRCHR) || defined(AS_STRRCHR) || defined(AS_STRCHRNUL)
+    if(__builtin_expect(c==0,0)) return (uchar*)(s+strlen((const char*)s)); /* C is NUL: the terminator is the answer.  */
+  #endif
+  uchar UNUSED *r = NULL;      /* Latest hit; only the reverse variants assign it.  */
+  tp_vector vc=BROADCAST(c);   /* Presumably C replicated into every lane -- confirm in vector.h.  */
+  #include "loop.h"            /* The scan itself, driven by the macros defined above.  */
+}
diff --git a/string/strchrnul.c b/string/strchrnul.c
index 0db5e23..6e6992f 100644
--- a/string/strchrnul.c
+++ b/string/strchrnul.c
@@ -21,149 +21,17 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
 #include <stdlib.h>
 
 #undef __strchrnul
 #undef strchrnul
 
-/* Find the first occurrence of C in S or the final NUL byte.  */
-char *
-__strchrnul (s, c_in)
-     const char *s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
 
-  c = (unsigned char) c_in;
+#ifndef STRCHRNUL
+#define STRCHRNUL __strchrnul
+#endif
 
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c || *char_ptr == '\0')
-      return (void *) char_ptr;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
-    {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
-    }
-
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (*cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	    }
-	}
-    }
-
-  /* This should never happen.  */
-  return NULL;
-}
+#define AS_STRCHRNUL
+#include "strchr.h"
 
 weak_alias (__strchrnul, strchrnul)
diff --git a/string/strlen.c b/string/strlen.c
index 5c1efda..9e917d1 100644
--- a/string/strlen.c
+++ b/string/strlen.c
@@ -1,106 +1,7 @@
-/* Copyright (C) 1991,1993,1997,2000,2003,2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Written by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se);
-   commentary by Jim Blandy (jimb@ai.mit.edu).
+#ifndef STRLEN
+#define STRLEN strlen
+#endif
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+#define AS_STRLEN
+#include "strlen.h"
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <stdlib.h>
-
-#undef strlen
-
-/* Return the length of the null-terminated string STR.  Scan for
-   the null terminator quickly by testing four bytes at a time.  */
-size_t
-strlen (str)
-     const char *str;
-{
-  const char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == '\0')
-      return char_ptr - str;
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
-
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
-    {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
-    }
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
-    {
-      longword = *longword_ptr++;
-
-      if (((longword - lomagic) & ~longword & himagic) != 0)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  if (cp[0] == 0)
-	    return cp - str;
-	  if (cp[1] == 0)
-	    return cp - str + 1;
-	  if (cp[2] == 0)
-	    return cp - str + 2;
-	  if (cp[3] == 0)
-	    return cp - str + 3;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (cp[4] == 0)
-		return cp - str + 4;
-	      if (cp[5] == 0)
-		return cp - str + 5;
-	      if (cp[6] == 0)
-		return cp - str + 6;
-	      if (cp[7] == 0)
-		return cp - str + 7;
-	    }
-	}
-    }
-}
-libc_hidden_builtin_def (strlen)
diff --git a/string/strlen.h b/string/strlen.h
new file mode 100644
index 0000000..0fd669d
--- /dev/null
+++ b/string/strlen.h
@@ -0,0 +1,25 @@
+/* Shared body for strlen and strnlen.  The including file defines
+   AS_STRLEN or AS_STRNLEN and the matching STRLEN/STRNLEN name; the
+   function body is the generic scan in loop.h.  */
+#define unroll 4
+#define prefetch 8
+
+#include "vector.h"
+
+#define DETECT_ZERO_BYTE
+#define TEST_CODE(so,sn) vzero
+/* Both exits report the distance from the start of the string.  */
+#define LOOP_BODY(p) return p-s;
+#define LOOP_END(p)  return p-s;
+
+#ifdef AS_STRNLEN
+  /* End of the scanned range; falls back to the highest address if s+ss wraps.  */
+  #define DETECT_END ((s+ss>=s) ? s+ss : ((uchar*)((long)-1)))
+  size_t STRNLEN( const uchar *s , size_t ss )
+#endif
+#ifdef AS_STRLEN
+  size_t STRLEN(  const uchar *s )
+#endif
+{
+  #include "loop.h"
+}
diff --git a/string/strnlen.c b/string/strnlen.c
index 65b9aa6..e90941d 100644
--- a/string/strnlen.c
+++ b/string/strnlen.c
@@ -1,165 +1,11 @@
-/* Find the length of STRING, but scan at most MAXLEN characters.
-   Copyright (C) 1991, 1993, 1997, 2000, 2001, 2005, 2011 Free Software Foundation, Inc.
-   Contributed by Jakub Jelinek <jakub@redhat.com>.
-
-   Based on strlen written by Torbjorn Granlund (tege@sics.se),
-   with help from Dan Sahlin (dan@sics.se);
-   commentary by Jim Blandy (jimb@ai.mit.edu).
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 2.1 of the
-   License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; see the file COPYING.LIB.  If
-   not, see <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include <stdlib.h>
-
-/* Find the length of S, but scan at most MAXLEN characters.  If no
-   '\0' terminator is found in that many characters, return MAXLEN.  */
-
-#ifdef STRNLEN
-# define __strnlen STRNLEN
+#ifndef STRNLEN
+#define STRNLEN __strnlen
 #endif
 
-size_t
-__strnlen (const char *str, size_t maxlen)
-{
-  const char *char_ptr, *end_ptr = str + maxlen;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, himagic, lomagic;
-
-  if (maxlen == 0)
-    return 0;
-
-  if (__builtin_expect (end_ptr < str, 0))
-    end_ptr = (const char *) ~0UL;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = str; ((unsigned long int) char_ptr
-			& (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == '\0')
-      {
-	if (char_ptr > end_ptr)
-	  char_ptr = end_ptr;
-	return char_ptr - str;
-      }
-
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
-
-  longword_ptr = (unsigned long int *) char_ptr;
-
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
+#define AS_STRNLEN
+#include "strlen.h"
 
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
-
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  himagic = 0x80808080L;
-  lomagic = 0x01010101L;
-  if (sizeof (longword) > 4)
-    {
-      /* 64-bit version of the magic.  */
-      /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-      himagic = ((himagic << 16) << 16) | himagic;
-      lomagic = ((lomagic << 16) << 16) | lomagic;
-    }
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  while (longword_ptr < (unsigned long int *) end_ptr)
-    {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.  */
-
-      longword = *longword_ptr++;
-
-      if ((longword - lomagic) & himagic)
-	{
-	  /* Which of the bytes was the zero?  If none of them were, it was
-	     a misfire; continue the search.  */
-
-	  const char *cp = (const char *) (longword_ptr - 1);
-
-	  char_ptr = cp;
-	  if (cp[0] == 0)
-	    break;
-	  char_ptr = cp + 1;
-	  if (cp[1] == 0)
-	    break;
-	  char_ptr = cp + 2;
-	  if (cp[2] == 0)
-	    break;
-	  char_ptr = cp + 3;
-	  if (cp[3] == 0)
-	    break;
-	  if (sizeof (longword) > 4)
-	    {
-	      char_ptr = cp + 4;
-	      if (cp[4] == 0)
-		break;
-	      char_ptr = cp + 5;
-	      if (cp[5] == 0)
-		break;
-	      char_ptr = cp + 6;
-	      if (cp[6] == 0)
-		break;
-	      char_ptr = cp + 7;
-	      if (cp[7] == 0)
-		break;
-	    }
-	}
-      char_ptr = end_ptr;
-    }
-
-  if (char_ptr > end_ptr)
-    char_ptr = end_ptr;
-  return char_ptr - str;
-}
-#ifndef STRNLEN
+#ifndef NO_ALIAS
 weak_alias (__strnlen, strnlen)
 #endif
-libc_hidden_def (strnlen)
+
diff --git a/string/strrchr.c b/string/strrchr.c
index a986ff9..62f92d5 100644
--- a/string/strrchr.c
+++ b/string/strrchr.c
@@ -1,49 +1,12 @@
-/* Copyright (C) 1991, 1995, 1996, 1997, 2003 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-
-#undef strrchr
-
-/* Find the last occurrence of C in S.  */
-char *
-strrchr (const char *s, int c)
-{
-  register const char *found, *p;
-
-  c = (unsigned char) c;
-
-  /* Since strchr is fast, we use it rather than the obvious loop.  */
-
-  if (c == '\0')
-    return strchr (s, '\0');
-
-  found = NULL;
-  while ((p = strchr (s, c)) != NULL)
-    {
-      found = p;
-      s = p + 1;
-    }
+#ifndef STRRCHR
+#define STRRCHR strrchr
+#endif
 
-  return (char *) found;
-}
+#define AS_STRRCHR
+#include "strchr.h"
 
 #ifdef weak_alias
 #undef rindex
 weak_alias (strrchr, rindex)
 #endif
 libc_hidden_builtin_def (strrchr)
diff --git a/string/strstr.c b/string/strstr.c
index 10e6fdc..2bca454 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -1,91 +1,7 @@
-/* Return the offset of one string within another.
-   Copyright (C) 1994,1996,1997,2000,2001,2003,2008,2009
-   Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* This particular implementation was written by Eric Blake, 2008.  */
-
-#ifndef _LIBC
-# include <config.h>
-#endif
-
-/* Specification of strstr.  */
-#include <string.h>
-
-#include <stdbool.h>
-
-#ifndef _LIBC
-# define __builtin_expect(expr, val)   (expr)
-#endif
-
-#define RETURN_TYPE char *
-#define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
-#include "str-two-way.h"
-
-#undef strstr
-
 #ifndef STRSTR
 #define STRSTR strstr
 #endif
 
-/* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
-   if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
-   HAYSTACK.  */
-char *
-STRSTR (const char *haystack_start, const char *needle_start)
-{
-  const char *haystack = haystack_start;
-  const char *needle = needle_start;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    ok &= *haystack++ == *needle++;
-  if (*needle)
-    return NULL;
-  if (ok)
-    return (char *) haystack_start;
-
-  /* Reduce the size of haystack using strchr, since it has a smaller
-     linear coefficient than the Two-Way algorithm.  */
-  needle_len = needle - needle_start;
-  haystack = strchr (haystack_start + 1, *needle_start);
-  if (!haystack || __builtin_expect (needle_len == 1, 0))
-    return (char *) haystack;
-  needle -= needle_len;
-  haystack_len = (haystack > haystack_start + needle_len ? 1
-		  : needle_len + haystack_start - haystack);
-
-  /* Perform the search.  Abstract memory is considered to be an array
-     of 'unsigned char' values, not an array of 'char' values.  See
-     ISO C 99 section 6.2.6.1.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) needle, needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) needle, needle_len);
-}
-libc_hidden_builtin_def (strstr)
+#define AS_STRSTR
+#include "strstr.h"
 
-#undef LONG_NEEDLE_THRESHOLD
diff --git a/string/strstr.h b/string/strstr.h
new file mode 100644
index 0000000..49654db
--- /dev/null
+++ b/string/strstr.h
@@ -0,0 +1,242 @@
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <string.h>
+
+#define unroll 4
+#define prefetch 8
+#define small_treshold 128
+
+#include "vector.h"
+
+
+
+#ifdef AS_STRSTR
+  #define _AS_STR_CASESTR_MEM(x,y,z) x 
+#endif
+#ifdef AS_STRCASESTR
+  #define _AS_STR_CASESTR_MEM(x,y,z) y
+#endif
+#ifdef AS_MEMMEM
+  #define _AS_STR_CASESTR_MEM(x,y,z) z 
+#endif
+#define CHAR(x) _AS_STR_CASESTR_MEM(*(x),\
+                                    (isupper(*(x)) ? tolower(*(x)) : *(x)),\
+                                    *(x))
+
+/* Count leading matches (compared via CHAR) over at most NO bytes of A and B, stepping both pointers by DIR.  TODO vectorize */
+static size_t strcmp_dir(const uchar *a,const uchar *b,long no,int dir){
+  int i;
+  for(i=0;i<no && CHAR(a)==CHAR(b);i++){a+=dir;b+=dir;}
+  return i;
+}
+
+/* Two-way algorithm: CROCHEMORE M., PERRIN D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+   Implementation based on http://www-igm.univ-mlv.fr/~lecroq/string/node26.html
+
+   We use a vectorized algorithm to find occurrences of the fragment pointed to by n+check.
+
+   On each occurrence we do a step of the two-way algorithm and tell the finder, via the skip_to variable, where it should resume the search.
+*/
+
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri);
+static uchar *strstr_two_way(uchar *s, uchar *s_end, uchar *n, int ns)
+{
+   int ell,   per, peri,pos;
+   two_way_preprocessing(n,ns,&per,&ell,&peri);
+   int fw,fwno,bw,bwno;
+   int check=ns-2;
+   uchar *skip_to=s+check;
+   s+=ns-2;
+
+  #define CAN_SKIP
+  #define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, parallel_tolower(x), x)
+  #define MASK_CONVERT(x) CHAR(&x)
+
+  #define LOOP_BODY(p) \
+     p -= ns - 1;\
+     pos = ell + 1;\
+     fwno = check - pos;\
+     fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+     if (fw < fwno ){\
+       p += fw + 1;\
+     } else {\
+       bwno = ell + 1;\
+       bw = strcmp_dir(n + bwno - 1, p + bwno - 1, bwno, -1);\
+       if ( bw < bwno ){\
+         p += per;\
+         if (peri){\
+           while(1){\
+             if(_AS_STR_CASESTR_MEM(0,0,p+ns>s_end)) return NULL;\
+             /*zero byte in forward check causes mismatch.*/\
+             pos = max(ell + 1, ns - per);\
+             fwno = ns - pos;\
+             fw = strcmp_dir(n + pos ,p + pos, fwno , 1);\
+             if (fw < fwno ){\
+               p += fw + 1;\
+               break ;\
+             } else {\
+               bwno = ell - (ns - per - 1);\
+               bw = strcmp_dir(n + ell, p + ell, bwno, -1);\
+               if ( bw < bwno ){\
+                 p += per;\
+               } else {\
+                 return p;\
+               }\
+             }\
+           }\
+         }\
+       } else {\
+         return p;\
+       }\
+     }\
+     skip_to = p + (ns - 1);
+
+  #include "strstr_vec.h"
+}
+
+#ifdef AS_STRCASESTR
+
+#endif
+static uchar *strstr_vec(uchar *s,uchar *s_end,uchar *n,int ns){
+#ifdef AS_STRCASESTR
+  if(!calc_tolower_class) calc_tolower_cls(); /*TODO recalculate when locale changes. */
+  #define CASECHECK(u) (tolower_class_no[u]==1 || (tolower_class_no[u]==2 && (tolower_class[u][0]^tolower_class[u][1])==32))
+  if (!(CASECHECK(n[ns-1]) || CASECHECK(n[ns-2])))
+    return strstr_two_way(s,s_end,n,ns);
+  #undef CASECHECK
+#endif
+  int buy=8*ns+64,rent=0; 
+  int check =  ns - _AS_STR_CASESTR_MEM(2,0,2);
+  s += ns-2;
+  tp_vector UNUSED diff=BROADCAST('A'^'a');
+  #define CASE_CONVERT(x) _AS_STR_CASESTR_MEM(x, OR(x,diff),  x)
+  #define MASK_CONVERT(x) _AS_STR_CASESTR_MEM(x, x|('A'^'a'), x)
+  #define LOOP_BODY(p) \
+    p -= ns - 1;\
+    int checked=strcmp_dir(p + check - 1,n + check - 1,check , -1); \
+    if (checked == check) return p; \
+    rent+=checked;\
+    if(buy+2*(p-s)>rent)        return strstr_two_way(p,s_end,n,ns);\
+
+    #include "strstr_vec.h"
+  }
+
+
+
+#ifdef AS_STRSTR
+  uchar *STRSTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_STRCASESTR
+  uchar *STRCASESTR(const uchar *s,const uchar *n)
+#endif
+#ifdef AS_MEMMEM
+ uchar *MEMMEM(const uchar *s,size_t ss,const uchar *n,size_t ns)
+#endif
+{ /* Find candidates via STRCHR, verify each with strcmp_dir, and hand over to strstr_vec once the buy/rent test below fires. */
+  int buy=small_treshold,rent=0;
+  uchar *p=(uchar*)s;
+#if defined( AS_STRSTR) || defined(AS_STRCASESTR)
+/* TODO: handle the case ss<ns by searching for the ends of n and s in parallel.  */
+  int ns=strlen( (char*)n);     
+  int ss=strnlen((char*)s,ns);
+#endif
+  if( ns > ss) return NULL;
+  if (!ns) return (uchar*) s;
+  uchar *s_end=(uchar*)((s+ss>=s) ? s+ss : ((uchar*)((long)-1)));
+/* For strstr and memmem this decreases the startup cost.
+   For strcasestr we align the haystack.  */
+#define STRCHR(s,sn,c) _AS_STR_CASESTR_MEM( strchr(s,c),\
+                                            (*(s-1) ? s : NULL),\
+                                            memchr(s,c,sn))
+  int check=ns-_AS_STR_CASESTR_MEM(1,0,1);
+  int page_offset= ((long)s)%4096;
+  p += check;
+  while(1){
+    p=(uchar*) STRCHR((char*)p,s_end-p,((char*)n)[ns-1]);
+    if(!p) return NULL;
+    p -= check;
+    int checked = strcmp_dir(n, p, check, 1);
+    if (checked == check) return p;
+    rent += check + 32;
+    /* The next implementation is faster but has a large startup cost.  */
+    if(buy - (p-s) < rent && 
+       p >= s - page_offset +BYTES_AT_ONCE){
+      /* The next implementations need two invariants.
+         First, that the string started before the position that is passed.
+         Second, that p - BYTES_AT_ONCE is valid memory.  */
+      return strstr_vec((uchar*)p+1,s_end,(uchar*)n,ns);
+    }
+    p++;
+    p += check;
+  }
+}
+
+
+static int maxSuf(uchar *x, int m, int *p, int invert) { /* NOTE(review): appears to be the maximal-suffix step of the two-way preprocessing; returns ms and stores the period candidate in *p - confirm against the reference */
+   int ms, j, k;
+   uchar a, b;
+
+   ms = -1;
+   j = 0;
+   k = *p = 1;
+   while (j + k < m) {
+      a = CHAR(x + j + k);
+      b = CHAR(x + ms + k);
+      if (invert ? (a > b) : (a < b)) { /* invert flips the byte ordering used for the comparison */
+         j += k;
+         k = 1;
+         *p = j - ms;
+      }
+      else
+         if (a == b)
+            if (k != *p)
+               ++k;
+            else {
+               j += *p;
+               k = 1;
+            }
+         else { /* a > b, or a < b when invert is set */
+            ms = j;
+            j = ms + 1;
+            k = *p = 1;
+         }
+   }
+   return(ms);
+}
+ 
+static int periodic(uchar *a,uchar *b,int siz){ /* nonzero iff strcmp_dir finds all siz leading bytes of a and b equal */
+  return strcmp_dir(a,b,siz,1)==siz;
+}
+
+static void two_way_preprocessing(uchar *n,int ns,int *per2,int *ell2,int *peri){ /* outputs: *per2, *ell2, *peri */
+  int u,v,up,vp;
+  int per,ell;
+  u=maxSuf(n,ns,&up,0);
+  v=maxSuf(n,ns,&vp,1);
+  ell = (u > v) ? u :  v; /* keep the larger maxSuf result ... */
+  per = (u > v) ? up : vp; /* ... together with its period */
+  *peri = periodic(n, n + per, ell + 1); /* does n match itself shifted by per over ell+1 bytes? */
+  if (!*peri)
+    per = max(ell + 1, ns - ell - 1) + 1; /* not periodic: use this shift instead */
+  *per2=per;
+  *ell2=ell;
+}
+
+
+
diff --git a/string/strstr_vec.h b/string/strstr_vec.h
new file mode 100644
index 0000000..de98e4e
--- /dev/null
+++ b/string/strstr_vec.h
@@ -0,0 +1,34 @@
+  tp_vector vn0=BROADCAST(MASK_CONVERT(n[ns-1-0])); 
+  tp_vector vn1=BROADCAST(MASK_CONVERT(n[ns-1-1]));
+  tp_vector e0,e1;
+#ifdef AS_STRSTR
+  #define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_STRCASESTR
+  #define DETECT_ZERO_BYTE
+#endif
+#ifdef AS_MEMMEM
+  #define DETECT_END s_end
+#endif
+
+#ifdef USE_ARITHMETIC 
+  #define TEST_CODE(so,sn) vzero;\
+        e0   =XOR(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0);\
+        e1   =XOR(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1);\
+        mvec=TEST_ZERO(OR(e0,e1));
+#else
+  #define TEST_CODE(so,sn) vzero;\
+     sn   = CASE_CONVERT(sn);\
+     e0   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-0),vn0); \
+     e1   = TEST_EQ(CONCAT(sn,so,BYTES_AT_ONCE-1),vn1); \
+     mvec = (AND(e0,e1));
+#endif
+
+  #define LOOP_END(p) return NULL;
+  #include "loop.h"
+
+#undef TEST_CODE
+#undef LOOP_BODY
+#undef LOOP_END
+#undef CASE_CONVERT
+#undef MASK_CONVERT
diff --git a/string/vector.h b/string/vector.h
new file mode 100644
index 0000000..d10079a
--- /dev/null
+++ b/string/vector.h
@@ -0,0 +1,84 @@
+/* Vectorized functions for string matching. They operate on many (4, 8, 16, 32) unsigned bytes at once. The allowed operations are:
+  TEST_ZERO(x)      - set the highest bit of each byte that was zero to 1, and to 0 otherwise.
+  TEST_EQ(x,y)      - set the highest bit of each byte where x and y are equal to 1, and to 0 otherwise.
+  BROADCAST(c)      - return a vector in which every byte has the value c.
+  TEST_RANGE(x,y,z) - set the highest bit of each byte where xi <= yi <= zi to 1, and to 0 otherwise. The condition zi-xi<128 must hold.
+  AND,OR,XOR,ANDNOT - bytewise logical operations.
+  SHIFT_UP(x,k), SHIFT_DOWN(x,k) - shift vector x by k bytes up/down.
+  CONCAT(xlow,xhigh,k) - concatenate xlow and xhigh and return the bytes starting from the k-th.
+  In shifts and concatenation k must be a constant.
+
+  To support another vector extension, see the sysdeps/x86_64/sse.h file.
+*/
+typedef unsigned char uchar;
+#define SI static inline
+
+#include <ctype.h>
+/* Case-folding tables used by the strcasestr variant.  TODO: these tables should be recalculated when the locale changes.  */
+static uchar _tolower_class[512];   /* backing store: members of each class, each list 0-terminated */
+static uchar *tolower_class[256];   /* per byte: pointer to its class's member list */
+static uchar tolower_class_no[256]; /* per byte: number of members in its class */
+static uchar tolower_fixed[256];    /* per byte: tolower() of uppercase bytes, identity otherwise */
+static int calc_tolower_class=0;    /* set to 1 once calc_tolower_cls has run */
+SI void calc_tolower_cls(void){int i,j;  uchar *p=_tolower_class;
+  /* As POSIX tolower has undefined behaviour on non-upper characters,
+     we construct a table with defined behaviour.  */
+  /* The second reason is that a tolower call is slow because the compiler spills all used xmm registers.  */
+  for (i=0;i<256;i++) tolower_fixed[i] = isupper(i) ? tolower(i) : i;
+  /* Calculate the equivalence classes.  */
+  for (i=0;i<256;i++){
+    for(j=0;j<i;j++) if(tolower_fixed[i]==tolower_fixed[j]) { /* an earlier byte folds the same way: share its class */
+      tolower_class_no[i]=tolower_class_no[j];
+      tolower_class[i]=tolower_class[j];
+      goto skip;
+    }
+    tolower_class[i]   =p; /* first byte of a new class: its member list starts here */
+    tolower_class_no[i]=0;
+    for(j=i;j<256;j++){
+      if(tolower_fixed[i]==tolower_fixed[j]){
+        tolower_class_no[i]++;
+        *p++=j;
+      }
+    }
+    *p++=0; /* terminate this class's member list */
+    skip:;
+  }
+  calc_tolower_class=1;
+}
+
+#define BYTES_AT_ONCE sizeof(tp_vector)
+#define PARA (BYTES_AT_ONCE*unroll)
+#define VSIZ_BYTE sizeof(tp_vector)
+#define VSIZ_BIT  (VSIZ_BYTE*8)
+#define MSIZ_BYTE sizeof(tp_mask)
+#define MSIZ_BIT  (MSIZ_BYTE*8)
+
+#define ALIGN(x,u)         s_offset=((size_t) (x))%((u)*BYTES_AT_ONCE);           s2=(uchar *)(((size_t) (x))&((long) (~((u)*BYTES_AT_ONCE-1))));
+/* Sets s_offset to the offset of x within a (u)*BYTES_AT_ONCE block and s2 to x rounded down to that boundary (the block size is assumed to be a power of two).  Writing s2=x-s_offset would be clearer, but some compilers do not then know that s2 is aligned.  */
+
+#define CACHE_LINE_SIZE 64
+#define UN_OP(n,e) SI tp_vector n(tp_vector x){ return e;}
+#define BIN_OP(n,e) SI tp_vector n(tp_vector x,tp_vector y){ return e;}
+#if defined( USE_SSE2) | defined(USE_SSSE3) | defined(USE_SSE4_1)
+#include "sse.h"
+#else
+#include "arit.h"
+#endif
+#undef UN_OP
+#undef BIN_OP
+
+#if unroll==1
+#define DO_ACTION ACTION(0) 
+#define AGREGATE_VECTOR  mvec0
+#elif unroll==2
+#define DO_ACTION ACTION(0) ACTION(1)
+#define  AGREGATE_VECTOR    OR(mvec0,mvec1)
+#elif unroll==4
+#define DO_ACTION ACTION(0) ACTION(1) ACTION(2) ACTION(3)
+#define AGREGATE_VECTOR OR(OR(mvec0,mvec1),OR(mvec2,mvec3))
+#endif
+
+SI int min(int x,int y){return x<y ? x : y;}
+SI int max(int x,int y){return x>y ? x : y;}
+
+#define UNUSED __attribute__((unused))
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
deleted file mode 100644
index dcc8bef..0000000
--- a/sysdeps/x86_64/memchr.S
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (C)  2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* fast SSE2 version with using pmaxub and 64 byte loop */
-
-	.text
-ENTRY(memchr)
-	movd	%rsi, %xmm1
-	mov	%rdi, %rcx
-
-	punpcklbw %xmm1, %xmm1
-	test	%rdx, %rdx
-	jz	L(return_null)
-	punpcklbw %xmm1, %xmm1
-
-	and	$63, %rcx
-	pshufd	$0, %xmm1, %xmm1
-
-	cmp	$48, %rcx
-	ja	L(crosscache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	and	$15, %rcx
-	and	$-16, %rdi
-	add	%rcx, %rdx
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	jmp	L(loop_prolog)
-
-	.p2align 4
-L(crosscache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-
-	pcmpeqb	%xmm1, %xmm0
-/* Check if there is a match.  */
-	pmovmskb %xmm0, %eax
-/* Remove the leading bytes.  */
-	sar	%cl, %eax
-	test	%eax, %eax
-	je	L(unaligned_no_match)
-/* Check which byte is a match.  */
-	bsf	%eax, %eax
-
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	add	%rcx, %rdx
-	sub	$16, %rdx
-	jbe	L(return_null)
-	add	$16, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	.p2align 4
-L(loop_prolog):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	add	$64, %rdi
-	pmovmskb %xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	test	$0x3f, %rdi
-	jz	L(align64_loop)
-
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	48(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-
-	add	$64, %rdi
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%rdi, %rcx
-	and	$-64, %rdi
-	and	$63, %rcx
-	add	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm0, %xmm3
-	pmaxub	%xmm2, %xmm4
-	pmaxub	%xmm3, %xmm4
-	pmovmskb %xmm4, %eax
-
-	add	$64, %rdi
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	sub	$64, %rdi
-
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	pmovmskb %xmm1, %eax
-	bsf	%eax, %eax
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(exit_loop):
-	add	$32, %rdx
-	jle	L(exit_loop_32)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches)
-
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb %xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb %xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	sub	$16, %rdx
-	jle	L(return_null)
-
-	pcmpeqb	48(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(exit_loop_32):
-	add	$32, %rdx
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches_1)
-	sub	$16, %rdx
-	jbe	L(return_null)
-
-	pcmpeqb	16(%rdi), %xmm1
-	pmovmskb %xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(matches0):
-	bsf	%eax, %eax
-	lea	-16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches):
-	bsf	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsf	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches32):
-	bsf	%eax, %eax
-	lea	32(%rax, %rdi), %rax
-	ret
-
-	.p2align 4
-L(matches_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	16(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches32_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(matches48_1):
-	bsf	%eax, %eax
-	sub	%rax, %rdx
-	jbe	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-END(memchr)
-
-strong_alias (memchr, __memchr)
-
-libc_hidden_builtin_def(memchr)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..c17aa68 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,24 +9,50 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+		   memmove-ssse3-back  strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3
+		   memcmp-ssse3 strchr-sse2-no-bsf
+
+sysdep_routines += strnlen strnlen_sse2 strnlen_ssse3 strnlen_sse4_1
+  CFLAGS-strnlen_ssse3.c  += -mssse3
+  CFLAGS-strnlen_sse4_1.c  += -msse4
+sysdep_routines += strstr strstr_sse2 strstr_ssse3 strstr_sse4_1
+  CFLAGS-strstr_ssse3.c  += -mssse3
+  CFLAGS-strstr_sse4_1.c  += -msse4
+sysdep_routines += strcasestr strcasestr_sse2 strcasestr_ssse3 strcasestr_sse4_1
+  CFLAGS-strcasestr_ssse3.c  += -mssse3
+  CFLAGS-strcasestr_sse4_1.c  += -msse4
+sysdep_routines += memmem memmem_sse2 memmem_ssse3 memmem_sse4_1
+  CFLAGS-memmem_ssse3.c  += -mssse3
+  CFLAGS-memmem_sse4_1.c  += -msse4
+sysdep_routines += strrchr strrchr_sse2 strrchr_ssse3 strrchr_sse4_1
+  CFLAGS-strrchr_ssse3.c  += -mssse3
+  CFLAGS-strrchr_sse4_1.c  += -msse4
+sysdep_routines += strchrnul strchrnul_sse2 strchrnul_ssse3 strchrnul_sse4_1
+  CFLAGS-strchrnul_ssse3.c  += -mssse3
+  CFLAGS-strchrnul_sse4_1.c  += -msse4
+sysdep_routines += memchr memchr_sse2 memchr_ssse3 memchr_sse4_1
+  CFLAGS-memchr_ssse3.c  += -mssse3
+  CFLAGS-memchr_sse4_1.c  += -msse4
+sysdep_routines += rawmemchr rawmemchr_sse2 rawmemchr_ssse3 rawmemchr_sse4_1
+  CFLAGS-rawmemchr_ssse3.c  += -mssse3
+  CFLAGS-rawmemchr_sse4_1.c  += -msse4
+sysdep_routines += memrchr memrchr_sse2 memrchr_ssse3 memrchr_sse4_1
+  CFLAGS-memrchr_ssse3.c  += -mssse3
+  CFLAGS-memrchr_sse4_1.c  += -msse4
+
+
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c   varshift
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/gen_stub b/sysdeps/x86_64/multiarch/gen_stub
new file mode 100755
index 0000000..da7cdf3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/gen_stub
@@ -0,0 +1,102 @@
+fn(){ # fn NAME RETTYPE PARAMS ARGNAMES BASE [EXT]: write NAME_<isa>.c stubs and the NAME.c dispatcher
+J=$1      # function name, e.g. strstr
+TP=$2     # C return type
+ARG=$3    # C parameter list
+ARGN=$4   # argument names, used to forward the call
+BASE=$5   # the stubs include string/BASE.h
+EXT=$6    # when non-empty, define __NAME and make NAME a weak alias of it
+# One stub per instruction set: set the selection macros, then include the generic code.
+for I in sse2 ssse3 sse4_1; do
+F="${J}_${I}.c"
+IU=$(echo "$I" | tr '[a-z]' '[A-Z]')
+JU=$(echo "$J" | tr '[a-z]' '[A-Z]')
+echo "/*generated by gen_stub*/"  > "$F"
+echo "#define AS_${JU}"          >> "$F"
+echo "#define USE_${IU}"         >> "$F"
+echo "#define ${JU} __${J}_${I}" >> "$F"
+echo "#include \"string/${BASE}.h\""       >> "$F"
+done
+# Without EXT the public symbol is NAME itself; with EXT it is __NAME plus a weak alias NAME.
+if [ -z "$EXT" ]; then
+FN=$J
+ALIASED=""
+else
+FN="__${J}"
+ALIASED="#ifndef NO_ALIAS
+weak_alias(${FN},${J});
+#endif"
+fi
+# Start NAME.c afresh; everything below appends to it.
+echo "/*generated by gen_stub*/"  > "${J}.c"
+# NOTE: inside double quotes the shell drops backslash-newline, so the #define below lands on one line.
+echo "
+#include <sysdep.h>
+#ifndef _LIBC
+# include <config.h>
+#endif
+
+#if defined SHARED  && !defined NOT_IN_libc
+
+#include \"init-arch.h\"
+
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__${J}_base, __GI_${J}, __${J}_base);
+
+" >> "${J}.c"
+# Declare every variant; the ifunc selector below tries sse4_1, then ssse3, then sse2.
+for I in sse2 ssse3 sse4_1; do
+  echo "extern ${TP} __${J}_${I}(${ARG}) attribute_hidden;" >> "${J}.c"
+done
+echo " ${TP} ${FN}(${ARG});
+${TP} __${J}_base(${ARG}){  
+  return __${J}_sse2(${ARGN}); 
+}
+libc_hidden_builtin_def (__${J}_base)
+extern __typeof (__${J}_base) __${J}_base attribute_hidden;
+
+libc_ifunc (${FN}, HAS_SSE4_1 ? __${J}_sse4_1 : (HAS_SSSE3 ? __${J}_ssse3 : (HAS_SSE2 ? __${J}_sse2 : __${J}_base)));
+#else
+
+#include \"${J}_sse2.c\"
+
+$TP ${FN}(${ARG}){
+  return __${J}_sse2(${ARGN});
+}
+#endif
+${ALIASED}
+" >> "${J}.c"
+# Print the matching Makefile fragment on stdout.
+  echo "sysdep_routines += ${J} ${J}_sse2 ${J}_ssse3 ${J}_sse4_1
+  CFLAGS-${J}_ssse3.c  += -mssse3
+  CFLAGS-${J}_sse4_1.c  += -msse4"
+
+}
+#fn strlen "size_t" "const char* n" "n"               strlen
+fn strnlen "size_t" "const char* n,size_t ns" "n,ns" strlen 
+
+# Substring search: all three are generated from string/strstr.h.
+fn strstr  "char *" "const char* s,const char *n"                      "s,n"       strstr
+fn strcasestr  "char *" "const char* s,const char *n"                  "s,n"       strstr ext
+fn memmem  "void *" "const void* s,size_t ss,const void *n, size_t ns" "s,ss,n,ns" strstr
+# Character search: everything below is generated from string/strchr.h.
+#fn strchr   "char *" "const char* s,int c" "s,c"  strchr
+# strchr is disabled: generating it fails because strchr expands to a builtin
+
+fn strrchr   "char *" "const char* s,int c" "s,c"  strchr 
+fn strchrnul "char *" "const char* s,int c" "s,c"  strchr ext
+
+fn memchr    "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  
+fn rawmemchr "void *" "const void* s,int c" "s,c"  strchr ext
+fn memrchr   "void *" "const void* s,int c,size_t ss" "s,c,ss"  strchr  ext
+# Also export strrchr under the name rindex (appended to the generated strrchr.c).
+echo "
+#ifndef NO_ALIAS
+weak_alias(strrchr,rindex);
+#endif" >> strrchr.c
+
+# Provide __strnlen as a plain wrapper function around strnlen instead of an alias.
+echo "size_t __strnlen(const char* n,size_t ns){
+  return strnlen(n,ns);
+}" >> strnlen.c
+
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index 551492d..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
-	    HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
-	    0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index a1f9968..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-# include <ctype.h>
-
-
-/* Similar to __m128i_strloadu.  Convert to lower case for none-POSIX/C
-   locale.  */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
-  union
-    {
-      char b[16];
-      __m128i x;
-    } u;
-
-  for (int i = 0; i < 16; ++i)
-    if (p[i] == 0)
-      {
-	u.b[i] = 0;
-	break;
-      }
-    else
-      u.b[i] = tolower (p[i]);
-
-  return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
deleted file mode 100644
index d1cfb3b..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ /dev/null
@@ -1,7 +0,0 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
-					  const unsigned char *s2)
-  attribute_hidden;
-
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
deleted file mode 100644
index 248328d..0000000
--- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNLEN
-#define STRLEN __strnlen_sse2_no_bsf
-#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
deleted file mode 100644
index 044b910..0000000
--- a/sysdeps/x86_64/multiarch/strnlen.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/* multiple version of strnlen
-   Copyright (C) 2011 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc.  */
-#ifndef NOT_IN_libc
-
-	.text
-ENTRY(__strnlen)
-	.type	__strnlen, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strnlen_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	2f
-	leaq	__strnlen_sse2_no_bsf(%rip), %rax
-2:	ret
-END(__strnlen)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strnlen_sse2, @function; \
-	.align 16; \
-	__strnlen_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
-
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
-	.globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
-#endif
-
-#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index c698c94..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
-   Copyright (C) 2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-	atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
-	/* ECX has OFFSET. */
-	and	$63, %rcx
-	cmp	$48, %rcx
-	pshufd	$0, %xmm1, %xmm1
-	ja	L(crosscache)
-
-/* unaligned string. */
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm2, %rcx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-/* Hancle unaligned string.  */
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm3, %rdx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	/* Remove the leading bytes.  */
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-	jmp	L(match_exit)
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	%cl, %cl
-	jz	L(find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(find_zero_8)
-	test	$0x01, %cl
-	jnz	L(FindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(FindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(FindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_8):
-	test	$0x10, %cl
-	jnz	L(FindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(FindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(FindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(FindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(FindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(FindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(FindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(FindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(FindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit1):
-	and	$1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jz	L(return_value)
-
-	.p2align 4
-L(match_exit):
-	test	%ah, %ah
-	jnz	L(match_exit_high)
-	mov	%al, %dl
-	and	$15 << 4, %dl
-	jnz	L(match_exit_8)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_8):
-	test	$0x80, %al
-	jnz	L(Exit8)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high):
-	mov	%ah, %dh
-	and	$15 << 4, %dh
-	jnz	L(match_exit_high_8)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high_8):
-	test	$0x80, %ah
-	jnz	L(Exit16)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit2):
-	lea	-15(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit3):
-	lea	-14(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit4):
-	lea	-13(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit6):
-	lea	-11(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit7):
-	lea	-10(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit8):
-	lea	-9(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit10):
-	lea	-7(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit11):
-	lea	-6(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit12):
-	lea	-5(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit14):
-	lea	-3(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit15):
-	lea	-2(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit16):
-	lea	-1(%rdi), %rax
-	ret
-
-/* Return NULL.  */
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	%cl, %cl
-	jz	L(prolog_find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(prolog_find_zero_8)
-	test	$0x01, %cl
-	jnz	L(PrologFindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(PrologFindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(PrologFindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_8):
-	test	$0x10, %cl
-	jnz	L(PrologFindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(PrologFindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(PrologFindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(prolog_find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(PrologFindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(PrologFindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(PrologFindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(PrologFindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(PrologFindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(PrologFindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit1):
-	and	$1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index c87d8fa..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,281 +0,0 @@
-/* strrchr with SSE4.2
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
-   the DSO.  In static binaries we need strrchr before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.text
-ENTRY(strrchr)
-	.type	strrchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strrchr_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strrchr_sse42(%rip), %rax
-	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strrchr_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strrchr)
-
-/*
-   This implementation uses SSE4 instructions to compare up to 16 bytes
-   at a time looking for the last occurrence of the character c in the
-   string s:
-
-   char *strrchr (const char *s, int c);
-
-   We use 0x4a:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_EACH
-	| _SIDD_MOST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   C C C C C C C C C C C C C C C C
-
-   to find out if the first 16byte data element has a byte C and the
-   last offset.  There are 4 cases:
-
-   1. The first 16byte data element has EOS and has the byte C at the
-      last offset X.
-   2. The first 16byte data element is valid and has the byte C at the
-      last offset X.
-   3. The first 16byte data element has EOS and doesn't have the byte C.
-   4. The first 16byte data element is valid and doesn't have the byte C.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	  1	  0
-    2		 X	  1	  0	  0
-    3		16	  0	  1	  0
-    4		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 3 with jz which branches
-   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
-
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__strrchr_sse42, @function
-__strrchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	testb	%sil, %sil
-	je	__strend_sse4
-	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
-	movd	%esi, %xmm1
-	punpcklbw	%xmm1, %xmm1
-	movl	%edi, %esi
-	punpcklbw	%xmm1, %xmm1
-	andl	$15, %esi
-	pshufd	$0, %xmm1, %xmm1
-	movq	%rdi, %r8
-	je	L(loop)
-
-/* Handle unaligned string using psrldq.  */
-	leaq	L(psrldq_table)(%rip), %rdx
-	andq	$-16, %r8
-	movslq	(%rdx,%rsi,4),%r9
-	movdqa	(%r8), %xmm0
-	addq	%rdx, %r9
-	jmp	*%r9
-
-/* Handle unaligned string with offset 1 using psrldq.  */
-	.p2align 4
-L(psrldq_1):
-	psrldq	$1, %xmm0
-
-	.p2align 4
-L(unaligned_pcmpistri):
-	pcmpistri	$0x4a, %xmm1, %xmm0
-	jnc	L(unaligned_no_byte)
-	leaq	(%rdi,%rcx), %rax
-L(unaligned_no_byte):
-	/* Find the length of the unaligned string.  */
-	pcmpistri	$0x3a, %xmm0, %xmm0
-	movl	$16, %edx
-	subl	%esi, %edx
-	cmpl	%ecx, %edx
-	/* Return RAX if the unaligned fragment to next 16B already
-	   contain the NULL terminator.  */
-	jg	L(exit)
-	addq	$16, %r8
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	pcmpistri	$0x4a, (%r8), %xmm1
-	jbe	L(match_or_eos)
-	addq	$16, %r8
-	jmp	L(loop)
-	.p2align 4
-L(match_or_eos):
-	je	L(had_eos)
-L(match_no_eos):
-	leaq	(%r8,%rcx), %rax
-	addq	$16, %r8
-	jmp     L(loop)
-	.p2align 4
-L(had_eos):
-	jnc     L(exit)
-	leaq	(%r8,%rcx), %rax
-	.p2align 4
-L(exit):
-	ret
-
-/* Handle unaligned string with offset 15 using psrldq.  */
-	.p2align 4
-L(psrldq_15):
-	psrldq	$15, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq.  */
-	.p2align 4
-L(psrldq_14):
-	psrldq	$14, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq.  */
-	.p2align 4
-L(psrldq_13):
-	psrldq	$13, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq.  */
-	.p2align 4
-L(psrldq_12):
-	psrldq	$12, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq.  */
-	.p2align 4
-L(psrldq_11):
-	psrldq	$11, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq.  */
-	.p2align 4
-L(psrldq_10):
-	psrldq	$10, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq.  */
-	.p2align 4
-L(psrldq_9):
-	psrldq	$9, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq.  */
-	.p2align 4
-L(psrldq_8):
-	psrldq	$8, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq.  */
-	.p2align 4
-L(psrldq_7):
-	psrldq	$7, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq.  */
-	.p2align 4
-L(psrldq_6):
-	psrldq	$6, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq.  */
-	.p2align 4
-L(psrldq_5):
-	psrldq	$5, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq.  */
-	.p2align 4
-L(psrldq_4):
-	psrldq	$4, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq.  */
-	.p2align 4
-L(psrldq_3):
-	psrldq	$3, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq.  */
-	.p2align 4
-L(psrldq_2):
-	psrldq	$2, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-	cfi_endproc
-	.size	__strrchr_sse42, .-__strrchr_sse42
-
-	.section .rodata.sse4.2,"a",@progbits
-	.p2align 4
-L(psrldq_table):
-	.int	L(loop) - L(psrldq_table)
-	.int	L(psrldq_1) - L(psrldq_table)
-	.int	L(psrldq_2) - L(psrldq_table)
-	.int	L(psrldq_3) - L(psrldq_table)
-	.int	L(psrldq_4) - L(psrldq_table)
-	.int	L(psrldq_5) - L(psrldq_table)
-	.int	L(psrldq_6) - L(psrldq_table)
-	.int	L(psrldq_7) - L(psrldq_table)
-	.int	L(psrldq_8) - L(psrldq_table)
-	.int	L(psrldq_9) - L(psrldq_table)
-	.int	L(psrldq_10) - L(psrldq_table)
-	.int	L(psrldq_11) - L(psrldq_table)
-	.int	L(psrldq_12) - L(psrldq_table)
-	.int	L(psrldq_13) - L(psrldq_table)
-	.int	L(psrldq_14) - L(psrldq_table)
-	.int	L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strrchr_sse2, @function; \
-	.align 16; \
-	__strrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index b8ed316..0000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include "init-arch.h"
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern char *__strstr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strstr_sse2) __strstr_sse2 attribute_hidden;
-
-libc_ifunc (strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
deleted file mode 100644
index b1b4139..0000000
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C)		tolower (C)
-# define CMPBYTE(C1, C2)	(tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C)		(C)
-# define CMPBYTE(C1, C2)	((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ORDER
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to do the scanning and string comparsion requirements of
-   sub-string match.  In the scanning phase, we process Cflag and ECX
-   index to locate the first fragment match; once the first fragment
-   match position has been identified, we do comparison of subsequent
-   string fragments until we can conclude false or true match; whe
-   n concluding a false match, we may need to repeat scanning process
-   from next relevant offset in the target string.
-
-   In the scanning phase we have 4 cases:
-   case		ECX	CFlag	ZFlag	SFlag
-    1		16	  0	  0	  0
-    2a		16	  0	  0	  1
-    2b		16	  0	  1	  0
-    2c		16	  0	  1	  1
-
-   1. No ordered-comparison match, both 16B fragments are valid, so
-      continue to next fragment.
-   2. No ordered-comparison match, there is EOS in either fragment,
-   2a. Zflg = 0, Sflg = 1, we continue
-   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
-   2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
-   In the string comparison phase, the 1st fragment match is fixed up
-   to produce ECX = 0.  Subsequent fragment compare of nonzero index
-   and no match conclude a false match.
-
-   case		ECX	CFlag	ZFlag	SFlag
-    3		 X	  1	  0	  0/1
-    4a		 0	  1	  0	  0
-    4b		 0	  1	  0	  1
-    4c		0 < X	  1	  0	  0/1
-    5		16	  0	  1	  0
-
-   3. An initial ordered-comparison fragment match, we fix up to do
-      subsequent string comparison
-   4a. Continuation of fragment comparison of a string compare.
-   4b. EOS reached in the reference string, we conclude true match and
-       return
-   4c. String compare failed if index is nonzero, we need to go back to
-       scanning
-   5.  failed string compare, go back to scanning
- */
-
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
-   If EOS occurs within less than 16B before 4KB boundary, we don't
-   cross to next page.  */
-
-static inline __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
-  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
-    {
-      size_t offset = ((size_t) p & (16 - 1));
-      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
-      if ((bmsk >> offset) != 0)
-	return __m128i_shift_right (a, offset);
-    }
-  return _mm_loadu_si128 ((__m128i *) p);
-}
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
-   locale and other which have single-byte letters only in the ASCII
-   range.  */
-static inline __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
-			  __m128i uchigh, __m128i lcqword)
-{
-  __m128i frag = __m128i_strloadu (p, zero);
-
-  /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'.  */
-  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
-  /* Compare if bytes are > 'A' - 1.  */
-  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
-  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
-  __m128i mask = _mm_and_si128 (r2, r1);
-  /* Apply lowercase bit 6 mask for above mask bytes == ff.  */
-  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
-   algorithm) overlap for a fully populated 16B vector.
-   Input parameter: 1st 16Byte loaded from the reference string of a
-		    strstr function.
-   We don't use KMP algorithm if reference string is less than 16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
-  __m128i b = _mm_unpacklo_epi8 (s2, s2);
-  __m128i a = _mm_unpacklo_epi8 (b, b);
-  a = _mm_shuffle_epi32 (a, 0);
-  b = _mm_srli_si128 (s2, sizeof (char));
-  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
-  /* _BitScanForward(&k1, bmsk); */
-  int k1;
-  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
-  if (!bmsk)
-    return 16;
-  else if (bmsk == 0x7fff)
-    return 1;
-  else if (!k1)
-    {
-      /* There are al least two distinct chars in s2.  If byte 0 and 1 are
-	 idential and the distinct value lies farther down, we can deduce
-	 the next byte offset to restart full compare is least no earlier
-	 than byte 3.  */
-      return 3;
-    }
-  else
-    {
-      /* Byte 1 is not degenerated to byte 0.  */
-      return k1 + 1;
-    }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
-  const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (p2[0] == '\0', 0))
-    return (char *) p1;
-
-  if (__builtin_expect (p1[0] == '\0', 0))
-    return NULL;
-
-  /* Check if p1 length is 1 byte long.  */
-  if (__builtin_expect (p1[1] == '\0', 0))
-    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
-			!= 0, 0))
-    return __strcasestr_sse42_nonascii (s1, s2);
-
-  const __m128i uclow = _mm_set1_epi8 (0x40);
-  const __m128i uchigh = _mm_set1_epi8 (0x5b);
-  const __m128i lcqword = _mm_set1_epi8 (0x20);
-  const __m128i zero = _mm_setzero_si128 ();
-#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-#  define strloadu __m128i_strloadu_tolower
-#  define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
-  const __m128i zero = _mm_setzero_si128 ();
-#endif
-
-  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
-  __m128i frag1 = strloadu (p1);
-
-  __m128i frag2;
-  if (p2[1] != '\0')
-    /* p2 is > 1 byte long.  */
-    frag2 = strloadu (p2);
-  else
-    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  if (cmp_s & cmp_c)
-    {
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
-      int len;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      p1 += cmp;
-      if ((len + cmp) <= 16)
-	return (char *) p1;
-
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      if ((len + cmp) <= 16)
-	return (char *) p1 + cmp;
-    }
-
-  if (cmp_s)
-    {
-      /* Adjust addr for 16B alginment in ensuing loop.  */
-      while (!cmp_z)
-	{
-	  p1 += cmp;
-	  /* Load up to 16 bytes of fragment.  */
-	  frag1 = strloadu (p1);
-	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
-	     once already, this time cmp will be zero and we can exit.  */
-	  if ((!cmp) & cmp_c)
-	    break;
-	}
-
-      if (!cmp_c)
-	return NULL;
-
-      /* Since s2 is less than 16 bytes, com_c is definitive
-	 determination of full match.  */
-      return (char *) p1 + cmp;
-    }
-
-  /* General case, s2 is at least 16 bytes or more.
-     First, the common case of false-match at first byte of p2.  */
-  const unsigned char *pt = NULL;
-  int kmp_fwd = 0;
-re_trace:
-  while (!cmp_c)
-    {
-      /* frag1 has null. */
-      if (cmp_z)
-	return NULL;
-
-      /* frag 1 has no null, advance 16 bytes.  */
-      p1 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, is there a partial match?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-    }
-
-  /* Next, handle initial positive match as first byte of p2.  We have
-     a partial fragment match, make full determination until we reached
-     end of s2.  */
-  if (!cmp)
-    {
-      if (cmp_z)
-	return (char *) p1;
-
-      pt = p1;
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-    }
-  else
-    {
-      /* Adjust 16B alignment.  */
-      p1 += cmp;
-      pt = p1;
-    }
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  while (!(cmp | cmp_z | cmp_s))
-    {
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 has null?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-    }
-
-  /* Full determination yielded a false result, retrace s1 to next
-     starting position.
-     Zflg	1      0      1			0/1
-     Sflg	0      1      1			0/1
-     cmp	na     0      0			>0
-     action   done   done   continue    continue if s2 < s1
-	      false  match  retrace s1     else false
-   */
-
-  if (cmp_s & !cmp)
-    return (char *) pt;
-  if (cmp_z)
-    {
-      if (!cmp_s)
-	return NULL;
-
-      /* Handle both zero and sign flag set and s1 is shorter in
-	 length.  */
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
-      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
-      int len;
-      int len1;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      __asm ("bsfl %[bmsk1], %[len1]"
-	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
-      if (len >= len1)
-	return NULL;
-    }
-  else if (!cmp)
-    return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue.  Default of multiple
-     paths that need to retrace from next byte in s1.  */
-  p2 = s2;
-  frag2 = strloadu (p2);
-
-  if (!kmp_fwd)
-    kmp_fwd = KMP16Bovrlap (frag2);
-
-  /* KMP algorithm predicted overlap needs to be corrected for
-     partial fragment compare.  */
-  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
-
-  /* Since s2 is at least 16 bytes long, we're certain there is no
-     match.  */
-  if (p1[0] == '\0')
-    return NULL;
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, is there a partial match?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  goto re_trace;
-}
diff --git a/sysdeps/x86_64/sse.h b/sysdeps/x86_64/sse.h
new file mode 100644
index 0000000..8db6d81
--- /dev/null
+++ b/sysdeps/x86_64/sse.h
@@ -0,0 +1,104 @@
+#include <stdint.h>
+
+#include <emmintrin.h>
+#ifdef USE_SSSE3
+  #define _HAS_SSSE3(x,y) x
+  #include <tmmintrin.h>
+#else
+  #define _HAS_SSSE3(x,y) y
+#endif
+#ifdef USE_SSE4_1
+  #define _HAS_SSE4_1(x,y) x
+  #undef  _HAS_SSSE3
+  #define _HAS_SSSE3( x,y) x
+  #include <smmintrin.h>
+#else
+  #define _HAS_SSE4_1(x,y) y
+#endif
+
+typedef __m128i tp_vector; /* one 16-byte SSE register */
+typedef unsigned long tp_mask; /* scalar bit mask; the 63-y in forget_after assumes it is 64 bits wide */
+/* NOTE(review): the expansion of PREFETCH already ends in ';' and its argument is not parenthesized.  */
+#define PREFETCH(x)	_mm_prefetch(((char *)x),_MM_HINT_T0);
+/* get_mask: bit i is the top bit of byte i.  NONZERO_MASK: SSE4.1 tests for any set bit, the fallback only for top bits.  */
+SI tp_mask get_mask(tp_vector x){  return  (tp_mask)((unsigned int)_mm_movemask_epi8(x)); }
+SI unsigned int NONZERO_MASK(tp_vector x){ return _HAS_SSE4_1(!_mm_testz_si128(x,x),get_mask(x));         }
+/* Index of the lowest set bit; y is unused.  __builtin_ctzl (0) is undefined, so t must be nonzero.  */
+SI tp_mask first_bit(tp_mask t,int y){ return __builtin_ctzl(t);}
+/* Single-bit mask and plain shifts; the shift count must be in [0, width of tp_mask).  */
+SI tp_mask bit_i(int i){            return ((tp_mask) 1)<<i;}
+SI tp_mask shift_down(tp_mask x,int y){ return x>>y;}
+SI tp_mask shift_up  (tp_mask x,int y){ return x<<y;}
+/* forget_first_bit clears the lowest set bit (i unused).  forget_before keeps bits >= y, forget_after bits <= y.  */
+SI tp_mask forget_first_bit(tp_mask t,int i){return t&(t-1);}
+SI tp_mask forget_before(tp_mask x,int y){return x&((y>=PARA) ? 0 : ((y<0) ? x : shift_up(  (tp_mask)-1,y)));}
+SI tp_mask forget_after( tp_mask x,int y){return x&((y>=PARA) ? x : ((y<0) ? 0 : shift_down((tp_mask)-1,63-y)));} /* 63-y assumes a 64-bit tp_mask */
+SI tp_mask get_bit(tp_mask x,int y){return x&bit_i(y);      }
+
+
+
+/* Vector with c in byte lane `shift' (0..15) and zero elsewhere -- intent inferred from the name, confirm.  */
+SI tp_vector BYTE_AT(uchar c,int shift)
+{ /* _mm_set_epi64x takes (high, low); selecting the half first keeps every shift count within 0..56.  */
+  return (shift<8) ? _mm_set_epi64x(0,((uint64_t)c)<<(8*shift)) : _mm_set_epi64x(((uint64_t)c)<<(8*(shift-8)),0);
+}
+
+BIN_OP(TEST_EQ,_mm_cmpeq_epi8(x,y)) /* BIN_OP is defined outside this file; presumably it defines NAME(x,y) -- confirm */
+#define TEST_ZERO(x) TEST_EQ(x,vzero) /* relies on a vzero being in scope at the use site */
+BIN_OP(AND   ,_mm_and_si128(x,y))
+BIN_OP(OR    ,_mm_or_si128(x,y))
+BIN_OP(ANDNOT,_mm_andnot_si128(y,x)) /* operands swapped on purpose: _mm_andnot_si128(y,x) is x & ~y */
+BIN_OP(XOR   ,_mm_xor_si128(x,y))
+/* Whole-register shifts counted in bytes; the count has to be a compile-time constant.  */
+#define SHIFT_DOWN _mm_srli_si128
+#define SHIFT_UP   _mm_slli_si128
+/* Bytes n..n+15 of the 32-byte sequence y (low) : x (high); n must be a constant in 0..BYTES_AT_ONCE.  */
+#define CONCAT(x,y,n) (((n)==0) ? (y) : (((n)==BYTES_AT_ONCE) ? (x) : _HAS_SSSE3( _mm_alignr_epi8(x,y,n),\
+                                                                              OR(SHIFT_UP(x,BYTES_AT_ONCE-(n)),SHIFT_DOWN(y,(n))))))
+/* Replicate the byte c into all 16 lanes of a vector.  */
+SI tp_vector BROADCAST(uchar c)
+{
+  return _mm_set1_epi8(c);
+}
+
+/* Lane-wise 8-bit add and subtract; _mm_add_epi8 and _mm_sub_epi8 wrap modulo 256.  */
+BIN_OP(ADD,_mm_add_epi8(x,y))
+BIN_OP(SUB,_mm_sub_epi8(x,y))
+/* Per byte: 0xff where x <= y <= z as unsigned values (needs z-x < 255), 0 elsewhere.  */
+SI tp_vector TEST_RANGE(tp_vector x,tp_vector y,tp_vector z){
+  /* SSE2 only has a signed byte compare, so both sides are biased by 128; the former bias
+     of 127 mapped y == x-1 to -128 and therefore wrongly reported it as inside the range.  */
+  tp_vector bias=ADD(BROADCAST(128),x);
+  tp_vector v=SUB(y,bias);
+  return _mm_cmplt_epi8(v,SUB(ADD(z,BROADCAST(1)),bias));
+}
+/* Per byte: 0xff where from <= v <= to (unsigned, to-from < 255); bias is 128, since 127 also accepted from-1.  */
+SI tp_vector TEST_RANGE_C(tp_vector v,uchar from,uchar to){
+  v=_mm_add_epi8(v,BROADCAST(-128-from)); /* (v-from) mod 256, moved into the signed range */
+  tp_vector limit=BROADCAST(-128+to-from+1); /* first biased value past the end of the range */
+  return _mm_cmplt_epi8(v,limit);
+}
+/* Lower-case the 16 bytes of m: bytes TEST_RANGE_C reports in 'A'..'Z' get bit 0x20 set in parallel,
+   then every byte whose top bit is set is replaced one at a time through the tolower_fixed table.  */
+SI tp_vector parallel_tolower(tp_vector m){ /*TODO sse4 insert*/
+  tp_vector high_bit=BROADCAST(128);
+  tp_vector l= AND(TEST_RANGE_C(m,'A','Z'),high_bit);/*gcc does not know how to fold sse constants*/
+  m=OR(m,_mm_srli_epi64(l,2)); /* 0x80>>2 == 0x20 and stays inside its own byte */
+  for(tp_mask mask=get_mask(m);mask;mask=forget_first_bit(mask,0)){ /* one bit per byte with its top bit set */
+    int i=first_bit(mask,0); /* the second argument is ignored; an uninitialized i used to be passed here */
+    ((uchar*)&m)[i]=tolower_fixed[((uchar*)&m)[i]];
+  }
+  return m;
+}
+/* 16-byte loads from x; LOAD needs x to be 16-byte aligned, LOAD_UNALIGNED accepts any address.  */
+#define LOAD(x) _mm_load_si128((tp_vector*)(x))
+#define LOAD_UNALIGNED(x) _mm_loadu_si128((tp_vector*)(x))
+/* Combine the per-vector masks mask0..mask3 (which must be in scope where this is used) into one word.  */
+#if unroll==1
+#define AGREGATE_MASK    mask0
+#elif unroll==2
+#define AGREGATE_MASK   (mask0|(mask1<<16))
+#elif unroll==4
+#define AGREGATE_MASK   ((mask0|(mask1<<16))|((mask2|(mask3<<16))<<32))
+/*one dependency less than mask0|(mask1<<16)|(mask2<<32)|(mask3<<48)*/
+#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
deleted file mode 100644
index baf3076..0000000
--- a/sysdeps/x86_64/strchrnul.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/* strchrnul (str, ch) -- Return pointer to first occurrence of CH in STR
-	or terminating NUL byte.
-   For AMD x86-64.
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY (__strchrnul)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
-	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	andl	%esi, %ecx
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	jz	2b
-
-1:	bsfl	%ecx, %edx
-	leaq	-16(%rdi,%rdx), %rax
-	ret
-END (__strchrnul)
-
-weak_alias (__strchrnul, strchrnul)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
deleted file mode 100644
index 7b38bf4..0000000
--- a/sysdeps/x86_64/strnlen.S
+++ /dev/null
@@ -1,63 +0,0 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
-   Copyright (C) 2010 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY(__strnlen)
-	movq	%rsi, %rax
-	testq	%rsi, %rsi
-	jz	3f
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
-	movq	%rdi, %r8
-	movq	$16, %r9
-	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %r10d
-	subq	%rdi, %rcx
-	shll	%cl, %r10d
-	subq	%rcx, %r9
-	pmovmskb %xmm2, %edx
-	andl	%r10d, %edx
-	jnz	1f
-	subq	%r9, %rsi
-	jbe	3f
-
-2:	movdqa	16(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	testl	%edx, %edx
-	jnz	1f
-	subq	$16, %rsi
-	jnbe	2b
-3:	ret
-
-1:	subq	%r8, %rdi
-	bsfl	%edx, %edx
-	addq	%rdi, %rdx
-	cmpq	%rdx, %rax
-	cmovnbq	%rdx, %rax
-	ret
-END(__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
deleted file mode 100644
index a5397e7..0000000
--- a/sysdeps/x86_64/strrchr.S
+++ /dev/null
@@ -1,80 +0,0 @@
-/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2009 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY (strrchr)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
-	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	andl	%esi, %edx
-	andl	%esi, %ecx
-	xorl	%eax, %eax
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jz	2b
-
-1:	bsfl	%ecx, %r9d
-	movl	$0xffffffff, %r8d
-	movl	$31, %ecx
-	jnz	5f
-
-	bsrl	%edx, %edx
-	jz	2b
-	leaq	-16(%rdi,%rdx), %rax
-	jmp	2b
-
-5:	subl	%r9d, %ecx
-	shrl	%cl, %r8d
-	andl	%r8d, %edx
-	bsrl	%edx, %edx
-	jz	4f
-	leaq	-16(%rdi,%rdx), %rax
-4:	ret
-END (strrchr)
-
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
-- 
1.7.4.4
Follow-Ups:
- Re: [PATCH] vectorized string functions
  - From: Andreas Jaeger
- Re: [PATCH] vectorized string functions
  - From: Rich Felker
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]