This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch, master, updated. glibc-2.14-364-g2d1f3a4


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  2d1f3a4db65d2731a695dee6b973accea8b9adc0 (commit)
       via  be13f7bff66e1850f9057dd813d6e7be022d9516 (commit)
      from  556a2007974ed39a68c87a8b5181f8057ecd0d6f (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so they are listed in full
below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2d1f3a4db65d2731a695dee6b973accea8b9adc0

commit 2d1f3a4db65d2731a695dee6b973accea8b9adc0
Author: Ulrich Drepper <drepper@gmail.com>
Date:   Sat Oct 15 11:11:12 2011 -0400

    Fix WS

diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index b3a2ca1..e35a23e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -8,7 +8,7 @@
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.
 
-   The GNU C Library is distributed in the hope that it will be useful, 
+   The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.
@@ -31,8 +31,8 @@
 # endif
 
 /* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elemnts.
+	   wmemcmp has to use SIGNED comparison for elements.
+	   memcmp has to use UNSIGNED comparison for elements.
 */
 
 	atom_text_section
@@ -1625,7 +1625,7 @@ L(more32bytes):
 # else
 	jmp	L(36bytes)
 # endif
- 
+
 	ALIGN	(4)
 L(more40bytes):
 	cmp	$40, %ecx

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=be13f7bff66e1850f9057dd813d6e7be022d9516

commit be13f7bff66e1850f9057dd813d6e7be022d9516
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date:   Sat Oct 15 11:10:08 2011 -0400

    Optimized memcmp and wmemcmp for x86-64 and x86-32
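
    The invariant called out in the warning comments throughout this
    patch is that memcmp must compare elements as UNSIGNED bytes while
    wmemcmp must compare elements as SIGNED wchar_t values.  A minimal
    C sketch of the two reference behaviors (illustrative only, not
    part of the commit; the ref_* names are invented here):

    #include <stddef.h>
    #include <wchar.h>

    /* Reference memcmp: elements are compared as unsigned bytes.  */
    static int
    ref_memcmp (const void *a, const void *b, size_t n)
    {
      const unsigned char *s1 = a;
      const unsigned char *s2 = b;

      while (n-- > 0)
        {
          if (*s1 != *s2)
            return *s1 < *s2 ? -1 : 1;
          s1++;
          s2++;
        }
      return 0;
    }

    /* Reference wmemcmp: elements are compared as signed wchar_t
       values, so WCHAR_MIN compares less than WCHAR_MAX even though
       its bit pattern is larger as an unsigned value.  Returning
       -1/0/1 instead of a raw subtraction also sidesteps the int
       overflow that the ChangeLog entry for wcsmbs/wmemcmp.c
       ("Fix overflow bug") presumably refers to.  */
    static int
    ref_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
    {
      while (n-- > 0)
        {
          if (*s1 != *s2)
            return *s1 < *s2 ? -1 : 1;
          s1++;
          s2++;
        }
      return 0;
    }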

diff --git a/ChangeLog b/ChangeLog
index 49f091a..414611a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,32 @@
+2011-09-27  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
+
+	* sysdeps/x86_64/multiarch/Makefile: (sysdep_routines): Add
+	memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+	* sysdeps/x86_64/multiarch/memcmp-ssse3: New file.
+	* sysdeps/x86_64/multiarch/memcmp.S: Update.  Add __memcmp_ssse3.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Update.
+	(USE_AS_WMEMCMP): New macro.
+	Fixing indents.
+	* sysdeps/x86_64/multiarch/wmemcmp.S: New file.
+	* sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file.
+	* sysdeps/x86_64/multiarch/wmemcmp-c.S: New file.
+	* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+	wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c
+	* sysdeps/i386/i686/multiarch/wmemcmp.S: New file.
+	* sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file.
+	* sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file.
+	* sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file.
+	* sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update.
+	(USE_AS_WMEMCMP): New macro.
+	* sysdeps/i386/i686/multiarch/memcmp-ssse3: Likewise.
+	* sysdeps/string/test-memcmp.c: Update.
+	Fix simple_wmemcmp.
+	Add new tests.
+	* wcsmbs/wmemcmp.c: Update.
+	(WMEMCMP): New macro.
+	Fix overflow bug.
+
 2011-10-12  Andreas Jaeger  <aj@suse.de>
 
 	[BZ #13268]
diff --git a/NEWS b/NEWS
index 7e9b2c1..cdb2973 100644
--- a/NEWS
+++ b/NEWS
@@ -33,7 +33,7 @@ Version 2.15
 * Optimized strchr and strrchr for SSE on x86-32.
   Contributed by Liubov Dmitrieva.
 
-* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
   Contributed by Liubov Dmitrieva.
 
 * New interfaces: scandirat, scandirat64
diff --git a/string/test-memcmp.c b/string/test-memcmp.c
index 4675bd9..f246d3a 100644
--- a/string/test-memcmp.c
+++ b/string/test-memcmp.c
@@ -29,9 +29,21 @@
 # define MEMCPY wmemcpy
 # define SIMPLE_MEMCMP simple_wmemcmp
 # define CHAR wchar_t
-# define MAX_CHAR 256000
-# define UCHAR uint32_t
+# define UCHAR wchar_t
 # define CHARBYTES 4
+# define CHAR__MIN WCHAR_MIN
+# define CHAR__MAX WCHAR_MAX
+int
+simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
+{
+  int ret = 0;
+  /* Warning!
+	wmemcmp has to use SIGNED comparison for elements.
+	memcmp has to use UNSIGNED comparison for elements.
+  */
+  while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;}
+  return ret;
+}
 #else
 # define MEMCMP memcmp
 # define MEMCPY memcpy
@@ -40,18 +52,20 @@
 # define MAX_CHAR 255
 # define UCHAR unsigned char
 # define CHARBYTES 1
-#endif
-
-typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+# define CHAR__MIN CHAR_MIN
+# define CHAR__MAX CHAR_MAX
 
 int
-SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n)
+simple_memcmp (const char *s1, const char *s2, size_t n)
 {
   int ret = 0;
 
-  while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0);
+  while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0);
   return ret;
 }
+#endif
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 
 IMPL (SIMPLE_MEMCMP, 0)
 IMPL (MEMCMP, 1)
@@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
   s2 = (CHAR *) (buf2 + align2);
 
   for (i = 0; i < len; i++)
-    s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
+    s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
 
   s1[len] = align1;
   s2[len] = align2;
@@ -412,8 +426,8 @@ check1 (void)
   s2[99] = 1;
   s1[100] = 116;
   s2[100] = 116;
-  s1[101] = -13;
-  s2[101] = -13;
+  s1[101] = CHAR__MIN;
+  s2[101] = CHAR__MAX;
   s1[102] = -109;
   s2[102] = -109;
   s1[103] = 1;
@@ -434,8 +448,8 @@ check1 (void)
   s2[110] = -109;
   s1[111] = 1;
   s2[111] = 1;
-  s1[112] = 20;
-  s2[112] = 20;
+  s1[112] = CHAR__MAX;
+  s2[112] = CHAR__MIN;
   s1[113] = -13;
   s2[113] = -13;
   s1[114] = -109;
@@ -444,9 +458,12 @@ check1 (void)
   s2[115] = 1;
 
   n = 116;
-  exp_result = SIMPLE_MEMCMP (s1, s2, n);
-  FOR_EACH_IMPL (impl, 0)
-    check_result (impl, s1, s2, n, exp_result);
+  for (size_t i = 0; i < n; i++)
+    {
+      exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i);
+      FOR_EACH_IMPL (impl, 0)
+	check_result (impl, s1 + i, s2 + i, n - i, exp_result);
+    }
 }
 
 int
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 8a4c219..98d1ad6 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
 		   strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
 		   wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
 		   memrchr-sse2 memrchr-sse2-bsf memrchr-c \
-		   rawmemchr-sse2 rawmemchr-sse2-bsf
+		   rawmemchr-sse2 rawmemchr-sse2-bsf \
+		   wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index b1ed778..1f5dbc1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.2
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,84 +20,97 @@
 
 #ifndef NOT_IN_libc
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP		__memcmp_sse4_2
-#endif
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_2
+# endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
 
-#define PARMS		4
-#define BLK1		PARMS
-#define BLK2		BLK1+4
-#define LEN		BLK2+4
-#define RETURN		POP (%ebx); ret; CFI_PUSH (%ebx)
+# define PARMS	4
+# define BLK1	PARMS
+# define BLK2	BLK1 + 4
+# define LEN	BLK2 + 4
+# define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
 
 
-#ifdef SHARED
-# define JMPTBL(I, B)	I - B
+# ifdef SHARED
+#  define JMPTBL(I, B)	I - B
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table and adjuested EDX/ESI. Go.  */	\
-    jmp		*%ebx
-
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
-	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
-	movl	(%esp), %ebx
-	ret
-#else
-# define JMPTBL(I, B)	I
+	jump	table with relative offsets.  INDEX is a register containing the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	call	__i686.get_pc_thunk.bx;	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute	address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We loaded the jump table and adjusted EDX/ESI. Go.  */	\
+	jmp	*%ebx
+# else
+#  define JMPTBL(I, B)	I
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
+	jump	table with relative offsets.  INDEX is a register containing the
+	index	into the jump table.   SCALE is the scale of INDEX. */
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
 
 	.section .text.sse4.2,"ax",@progbits
 ENTRY (MEMCMP)
 	movl	BLK1(%esp), %eax
 	movl	BLK2(%esp), %edx
 	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+# else
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
+# endif
+
 	pxor	%xmm0, %xmm0
 	cmp	$64, %ecx
 	ja	L(64bytesormore)
 	cmp	$8, %ecx
-	PUSH (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+# else
 	jb	L(less8bytes)
+	PUSH	(%ebx)
+# endif
+
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less8bytes):
 	mov	(%eax), %bl
 	cmpb	(%edx), %bl
@@ -141,22 +154,49 @@ L(less8bytes):
 	mov	6(%eax), %bl
 	cmpb	6(%edx), %bl
 	je	L(0bytes)
+
 L(nonzero):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(above)
 	neg	%eax
 L(above):
 	ret
 	CFI_PUSH (%ebx)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(0bytes):
-	POP (%ebx)
+	POP	(%ebx)
 	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
 	jb	L(0bytesend)
 	movzbl	(%eax), %eax
@@ -164,14 +204,14 @@ L(less1bytes):
 	sub	%edx, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(0bytesend):
 	xor	%eax, %eax
 	ret
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(64bytesormore):
-	PUSH (%ebx)
+	PUSH	(%ebx)
 	mov	%ecx, %ebx
 	mov	$64, %ecx
 	sub	$64, %ebx
@@ -208,7 +248,14 @@ L(64bytesormore_loop):
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
 
-	ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* Label is needed only for filling table_64bytes */
+L(unreal_case):
+/* no code here */
+
+# endif
+	.p2align 4
 L(find_16diff):
 	sub	$16, %ecx
 L(find_32diff):
@@ -218,9 +265,9 @@ L(find_48diff):
 L(find_64diff):
 	add	%ecx, %edx
 	add	%ecx, %eax
-	jmp	L(16bytes)
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(16bytes):
 	mov	-16(%eax), %ecx
 	mov	-16(%edx), %ebx
@@ -243,8 +290,30 @@ L(4bytes):
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
+# else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(49bytes):
 	movdqu	-49(%eax), %xmm1
 	movdqu	-49(%edx), %xmm2
@@ -285,7 +354,7 @@ L(5bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(50bytes):
 	mov	$-50, %ebx
 	movdqu	-50(%eax), %xmm1
@@ -330,7 +399,7 @@ L(2bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(51bytes):
 	mov	$-51, %ebx
 	movdqu	-51(%eax), %xmm1
@@ -378,8 +447,8 @@ L(1bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(52bytes):
 	movdqu	-52(%eax), %xmm1
 	movdqu	-52(%edx), %xmm2
@@ -402,13 +471,18 @@ L(20bytes):
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(53bytes):
 	movdqu	-53(%eax), %xmm1
 	movdqu	-53(%edx), %xmm2
@@ -440,7 +514,7 @@ L(21bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(54bytes):
 	movdqu	-54(%eax), %xmm1
 	movdqu	-54(%edx), %xmm2
@@ -476,7 +550,7 @@ L(22bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(55bytes):
 	movdqu	-55(%eax), %xmm1
 	movdqu	-55(%edx), %xmm2
@@ -513,8 +587,8 @@ L(23bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(56bytes):
 	movdqu	-56(%eax), %xmm1
 	movdqu	-56(%edx), %xmm2
@@ -538,18 +612,27 @@ L(24bytes):
 	jnc	L(less16bytes)
 
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(57bytes):
 	movdqu	-57(%eax), %xmm1
 	movdqu	-57(%edx), %xmm2
@@ -585,7 +668,7 @@ L(25bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(58bytes):
 	movdqu	-58(%eax), %xmm1
 	movdqu	-58(%edx), %xmm2
@@ -627,7 +710,7 @@ L(26bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(59bytes):
 	movdqu	-59(%eax), %xmm1
 	movdqu	-59(%edx), %xmm2
@@ -668,8 +751,8 @@ L(27bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
-
-	ALIGN (4)
+# endif
+	.p2align 4
 L(60bytes):
 	movdqu	-60(%eax), %xmm1
 	movdqu	-60(%edx), %xmm2
@@ -691,22 +774,38 @@ L(28bytes):
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
+
 	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-12(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
 	jne	L(find_diff)
+
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
+
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(61bytes):
 	movdqu	-61(%eax), %xmm1
 	movdqu	-61(%edx), %xmm2
@@ -749,7 +848,7 @@ L(29bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(62bytes):
 	movdqu	-62(%eax), %xmm1
 	movdqu	-62(%edx), %xmm2
@@ -792,7 +891,7 @@ L(30bytes):
 	jne	L(end)
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(63bytes):
 	movdqu	-63(%eax), %xmm1
 	movdqu	-63(%edx), %xmm2
@@ -838,8 +937,9 @@ L(31bytes):
 	mov	$0, %eax
 	jne	L(end)
 	RETURN
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(64bytes):
 	movdqu	-64(%eax), %xmm1
 	movdqu	-64(%edx), %xmm2
@@ -863,28 +963,45 @@ L(32bytes):
 	jnc	L(less16bytes)
 
 	mov	-16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-16(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-16(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-12(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-12(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-8(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-8(%edx), %ecx
+# endif
 	jne	L(find_diff)
 
 	mov	-4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%edx), %ebx
 	cmp	%ebx, %ecx
+# else
+	cmp	-4(%edx), %ecx
+# endif
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less16bytes):
 	add	%ebx, %eax
 	add	%ebx, %edx
@@ -910,9 +1027,35 @@ L(less16bytes):
 	mov	$0, %eax
 	jne	L(find_diff)
 	RETURN
+# else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
+# ifndef USE_AS_WMEMCMP
 	cmpb	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
@@ -923,17 +1066,29 @@ L(find_diff):
 	jne	L(end)
 	cmp	%bx, %cx
 L(end):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+# else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+# endif
 END (MEMCMP)
 
 	.section .rodata.sse4.2,"a",@progbits
-	ALIGN (2)
+	.p2align 2
 	.type	L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
 L(table_64bytes):
 	.int	JMPTBL (L(0bytes), L(table_64bytes))
 	.int	JMPTBL (L(1bytes), L(table_64bytes))
@@ -1000,5 +1155,72 @@ L(table_64bytes):
 	.int	JMPTBL (L(62bytes), L(table_64bytes))
 	.int	JMPTBL (L(63bytes), L(table_64bytes))
 	.int	JMPTBL (L(64bytes), L(table_64bytes))
-	.size	L(table_64bytes), .-L(table_64bytes)
+# else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+# endif
 #endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15f..eab85c1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,47 +20,64 @@
 
 #ifndef NOT_IN_libc
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP		__memcmp_ssse3
-#endif
+# ifndef MEMCMP
+#  define MEMCMP		__memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
-  cfi_rel_offset (REG, 0)
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
-  cfi_restore (REG)
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+# define PARMS		4
+# define BLK1		PARMS
+# define BLK2		BLK1+4
+# define LEN		BLK2+4
+# define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
 
-#define PARMS		4
-#define BLK1		PARMS
-#define BLK2		BLK1+4
-#define LEN		BLK2+4
-#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
 
-	.section .text.ssse3,"ax",@progbits
+	atom_text_section
 ENTRY (MEMCMP)
 	movl	LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(zero)
+# endif
+
 	movl	BLK1(%esp), %eax
 	cmp	$48, %ecx
 	movl	BLK2(%esp), %edx
 	jae	L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
-	PUSH (%ebx)
+# endif
+
+	PUSH	(%ebx)
 	add	%ecx, %edx
 	add	%ecx, %eax
 	jmp	L(less48bytes)
 
-	ALIGN (4)
-	CFI_POP (%ebx)
+	CFI_POP	(%ebx)
+
+# ifndef USE_AS_WMEMCMP
+	.p2align 4
 L(less1bytes):
 	jb	L(zero)
 	movb	(%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
 	neg	%eax
 L(1bytesend):
 	ret
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(zero):
-	mov	$0, %eax
+	xor	%eax, %eax
 	ret
 
-	ALIGN (4)
+	.p2align 4
 L(48bytesormore):
-	PUSH (%ebx)
-	PUSH (%esi)
-	PUSH (%edi)
+	PUSH	(%ebx)
+	PUSH	(%esi)
+	PUSH	(%edi)
 	cfi_remember_state
-	movdqu    (%eax), %xmm3
-	movdqu    (%edx), %xmm0
+	movdqu	(%eax), %xmm3
+	movdqu	(%edx), %xmm0
 	movl	%eax, %edi
 	movl	%edx, %esi
-	pcmpeqb   %xmm0, %xmm3
-	pmovmskb  %xmm3, %edx
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
 	lea	16(%edi), %edi
 
-	sub      $0xffff, %edx
+	sub	$0xffff, %edx
 	lea	16(%esi), %esi
-	jnz	  L(less16bytes)
+	jnz	L(less16bytes)
 	mov	%edi, %edx
 	and	$0xf, %edx
 	xor	%edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
 	jz	L(shr_0)
 	xor	%edx, %esi
 
+# ifndef USE_AS_WMEMCMP
 	cmp	$8, %edx
 	jae	L(next_unaligned_table)
 	cmp	$0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
 	je	L(shr_6)
 	jmp	L(shr_7)
 
-	ALIGN (4)
+	.p2align 2
 L(next_unaligned_table):
 	cmp	$8, %edx
 	je	L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
 	cmp	$14, %edx
 	je	L(shr_14)
 	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(shr_0):
 	cmp	$80, %ecx
 	jae	L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_0_gobble):
 	lea	-48(%ecx), %ecx
 	movdqa	(%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_1_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	1(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_2_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	2(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_3_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	3(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_4_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	4(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_5_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	5(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_6_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	6(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_7_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	7(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_8_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	8(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_9_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	9(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_10_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	10(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_11_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	11(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_12_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	12(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
+# ifndef USE_AS_WMEMCMP
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_13_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	13(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_14_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	14(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15):
 	cmp	$80, %ecx
 	lea	-48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
 	jnz	L(exit)
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shr_15_gobble):
 	sub	$32, %ecx
 	movdqa	16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
 
 	lea	(%ecx, %edi,1), %eax
 	lea	15(%ecx, %esi,1), %edx
-	POP (%edi)
-	POP (%esi)
+	POP	(%edi)
+	POP	(%esi)
 	jmp	L(less48bytes)
+# endif
 
 	cfi_restore_state
 	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(exit):
 	pmovmskb %xmm1, %ebx
 	sub	$0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
 	lea	-16(%esi), %esi
 	lea	-16(%edi), %edi
 	mov	%ebx, %edx
+
 L(first16bytes):
 	add	%eax, %esi
 L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
 	test	%dl, %dl
 	jz	L(next_24_bytes)
 
@@ -1492,61 +1531,61 @@ L(less16bytes):
 	test	$0x40, %dl
 	jnz	L(Byte22)
 L(Byte23):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte16):
-	movzbl	 -16(%edi), %eax
-	movzbl	 -16(%esi), %edx
+	movzbl	-16(%edi), %eax
+	movzbl	-16(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte17):
-	movzbl	 -15(%edi), %eax
-	movzbl	 -15(%esi), %edx
+	movzbl	-15(%edi), %eax
+	movzbl	-15(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte18):
-	movzbl	 -14(%edi), %eax
-	movzbl	 -14(%esi), %edx
+	movzbl	-14(%edi), %eax
+	movzbl	-14(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte19):
-	movzbl	 -13(%edi), %eax
-	movzbl	 -13(%esi), %edx
+	movzbl	-13(%edi), %eax
+	movzbl	-13(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte20):
-	movzbl	 -12(%edi), %eax
-	movzbl	 -12(%esi), %edx
+	movzbl	-12(%edi), %eax
+	movzbl	-12(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte21):
-	movzbl	 -11(%edi), %eax
-	movzbl	 -11(%esi), %edx
+	movzbl	-11(%edi), %eax
+	movzbl	-11(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(Byte22):
-	movzbl	 -10(%edi), %eax
-	movzbl	 -10(%esi), %edx
+	movzbl	-10(%edi), %eax
+	movzbl	-10(%esi), %edx
 	sub	%edx, %eax
 	RETURN
 
-	ALIGN (4)
+	.p2align 4
 L(next_24_bytes):
 	lea	8(%edi), %edi
 	lea	8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
 	test	$0x40, %dh
 	jnz	L(Byte22)
 
-	ALIGN (4)
+	.p2align 4
 L(Byte31):
-	movzbl	 -9(%edi), %eax
-	movzbl	 -9(%esi), %edx
+	movzbl	-9(%edi), %eax
+	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN_END
+# else
+
+/* special for wmemcmp */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%edi), %eax
+	cmp	-16(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(second_double_word):
+	mov	-12(%edi), %eax
+	cmp	-12(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%edi), %eax
+	cmp	-8(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(fourth_double_word):
+	mov	-4(%edi), %eax
+	cmp	-4(%esi), %eax
+	jne	L(nequal)
+	RETURN
+
+	.p2align 4
+L(nequal):
+	mov	$1, %eax
+	jg	L(nequal_bigger)
+	neg	%eax
+	RETURN
+
+	.p2align 4
+L(nequal_bigger):
+	RETURN_END
+# endif
 
 	CFI_PUSH (%ebx)
-	ALIGN (4)
+
+	.p2align 4
 L(more8bytes):
 	cmp	$16, %ecx
 	jae	L(more16bytes)
 	cmp	$8, %ecx
 	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$9, %ecx
 	je	L(9bytes)
 	cmp	$10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
 	cmp	$14, %ecx
 	je	L(14bytes)
 	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more16bytes):
 	cmp	$24, %ecx
 	jae	L(more24bytes)
 	cmp	$16, %ecx
 	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$17, %ecx
 	je	L(17bytes)
 	cmp	$18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
 	cmp	$22, %ecx
 	je	L(22bytes)
 	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more24bytes):
 	cmp	$32, %ecx
 	jae	L(more32bytes)
 	cmp	$24, %ecx
 	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$25, %ecx
 	je	L(25bytes)
 	cmp	$26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
 	cmp	$30, %ecx
 	je	L(30bytes)
 	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more32bytes):
 	cmp	$40, %ecx
 	jae	L(more40bytes)
 	cmp	$32, %ecx
 	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$33, %ecx
 	je	L(33bytes)
 	cmp	$34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
 	cmp	$38, %ecx
 	je	L(38bytes)
 	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+
+	.p2align 4
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
 
-	ALIGN (4)
+	.p2align 4
 L(more40bytes):
 	cmp	$40, %ecx
 	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
 	cmp	$41, %ecx
 	je	L(41bytes)
 	cmp	$42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
 	je	L(46bytes)
 	jmp	L(47bytes)
 
-	ALIGN (4)
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-
-	ALIGN (4)
+	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
 	mov	-44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
 	cmp	%ebx, %ecx
 	mov	$0, %eax
 	jne	L(find_diff)
-	POP (%ebx)
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+# else
+	.p2align 4
+L(44bytes):
+	mov	-44(%eax), %ecx
+	cmp	-44(%edx), %ecx
+	jne	L(find_diff)
+L(40bytes):
+	mov	-40(%eax), %ecx
+	cmp	-40(%edx), %ecx
+	jne	L(find_diff)
+L(36bytes):
+	mov	-36(%eax), %ecx
+	cmp	-36(%edx), %ecx
+	jne	L(find_diff)
+L(32bytes):
+	mov	-32(%eax), %ecx
+	cmp	-32(%edx), %ecx
+	jne	L(find_diff)
+L(28bytes):
+	mov	-28(%eax), %ecx
+	cmp	-28(%edx), %ecx
+	jne	L(find_diff)
+L(24bytes):
+	mov	-24(%eax), %ecx
+	cmp	-24(%edx), %ecx
+	jne	L(find_diff)
+L(20bytes):
+	mov	-20(%eax), %ecx
+	cmp	-20(%edx), %ecx
+	jne	L(find_diff)
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	xor	%eax, %eax
+	cmp	-4(%edx), %ecx
+	jne	L(find_diff)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
+# endif
 
-	ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+	.p2align 4
 L(45bytes):
 	mov	-45(%eax), %ecx
 	mov	-45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
 	cmp	-1(%edx), %cl
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(46bytes):
 	mov	-46(%eax), %ecx
 	mov	-46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
 	cmp	%bh, %ch
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(47bytes):
 	movl	-47(%eax), %ecx
 	movl	-47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
 	cmpb	-1(%edx), %al
 	mov	$0, %eax
 	jne	L(end)
-	POP (%ebx)
+	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
 
-	ALIGN (4)
+	.p2align 4
 L(find_diff):
 	cmpb	%bl, %cl
 	jne	L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
 	cmp	%bl, %cl
 	jne	L(end)
 	cmp	%bx, %cx
+
+	.p2align 4
 L(end):
-	POP (%ebx)
+	POP	(%ebx)
 	mov	$1, %eax
 	ja	L(bigger)
 	neg	%eax
 L(bigger):
 	ret
+# else
 
-END (MEMCMP)
+/* for wmemcmp */
+	.p2align 4
+L(find_diff):
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
 
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+# endif
+END (MEMCMP)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000..94ff615
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP  __wmemcmp_ia32
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000..1a857c7
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000..a41ef95
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
similarity index 53%
copy from sysdeps/x86_64/multiarch/memcmp.S
copy to sysdeps/i386/i686/multiarch/wmemcmp.S
index 301ab28..5080c14 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -1,5 +1,5 @@
-/* Multiple versions of memcmp
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* Multiple versions of wmemcmp
+   Copyright (C)  2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -22,38 +22,38 @@
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in libc. */
+
 #ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+	__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx
+	ret
+
 	.text
-ENTRY(memcmp)
-	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function
+	pushl	%ebx
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__memcmp_sse2(%rip), %rax
-	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+1:	leal	__wmemcmp_ia32@GOTOFF(%ebx), %eax
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
 	jz	2f
-	leaq	__memcmp_sse4_1(%rip), %rax
-2:	ret
-END(memcmp)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcmp_sse2, @function; \
-	.p2align 4; \
-	__memcmp_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
+	leal	__wmemcmp_ssse3@GOTOFF(%ebx), %eax
+	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	jz	2f
+	leal	__wmemcmp_sse4_2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wmemcmp)
 #endif
-
-#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a5254dc..e0bb984 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strrchr-sse2-no-bsf strchr-sse2-no-bsf
+		   strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index fc439bb..28dd505 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.1
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -20,43 +20,54 @@
 
 #ifndef NOT_IN_libc
 
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
 
-#ifndef MEMCMP
-# define MEMCMP		__memcmp_sse4_1
-#endif
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_sse4_1
+# endif
 
-#ifndef ALIGN
-# define ALIGN(n)	.p2align n
-#endif
+# ifndef ALIGN
+#  define ALIGN(n)	.p2align n
+# endif
 
-#define JMPTBL(I, B)	(I - B)
+# define JMPTBL(I, B)	(I - B)
 
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
   lea		TABLE(%rip), %r11;				\
   movslq	(%r11, INDEX, SCALE), %rcx;			\
   add		%r11, %rcx;					\
   jmp		*%rcx;						\
   ud2
 
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
 	.section .text.sse4.1,"ax",@progbits
 ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
 	pxor	%xmm0, %xmm0
 	cmp	$79, %rdx
 	ja	L(79bytesormore)
+# ifndef USE_AS_WMEMCMP
 	cmp	$1, %rdx
 	je	L(firstbyte)
+# endif
 	add	%rdx, %rsi
 	add	%rdx, %rdi
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 
+# ifndef USE_AS_WMEMCMP
 	ALIGN (4)
 L(firstbyte):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
 	sub	%ecx, %eax
 	ret
+# endif
 
 	ALIGN (4)
 L(79bytesormore):
@@ -308,11 +319,11 @@ L(less32bytesin256):
 
 	ALIGN (4)
 L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
 	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
 	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8
@@ -624,11 +635,11 @@ L(less32bytesin256in2alinged):
 
 	ALIGN (4)
 L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
 	mov	$DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
 	mov	__x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
 	mov	%r8, %r9
 	shr	$1, %r8
 	add	%r9, %r8
@@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
 	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
 L(L2_L3_cache_aglined):
 	sub	$64, %rdx
+
 	ALIGN (4)
 L(L2_L3_aligned_128bytes_loop):
 	prefetchnta 0x1c0(%rdi)
@@ -803,13 +815,19 @@ L(12bytes):
 	jne	L(diffin8bytes)
 L(4bytes):
 	mov	-4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
 	mov	-4(%rdi), %eax
 	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx
+# endif
 	jne	L(diffin4bytes)
 L(0bytes):
 	xor	%eax, %eax
 	ret
 
+# ifndef USE_AS_WMEMCMP
+/* unreal case for wmemcmp */
 	ALIGN (4)
 L(65bytes):
 	movdqu	-65(%rdi), %xmm1
@@ -1017,6 +1035,7 @@ L(1bytes):
 	movzbl	-1(%rsi), %ecx
 	sub	%ecx, %eax
 	ret
+# endif
 
 	ALIGN (4)
 L(68bytes):
@@ -1047,13 +1066,20 @@ L(20bytes):
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
-	mov	-4(%rdi), %eax
 	mov	-4(%rsi), %ecx
+
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%rdi), %eax
 	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx
+# endif
 	jne	L(diffin4bytes)
 	xor	%eax, %eax
 	ret
 
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
 	ALIGN (4)
 L(69bytes):
 	movdqu	-69(%rsi), %xmm1
@@ -1161,6 +1187,7 @@ L(23bytes):
 	jne	L(diffin8bytes)
 	xor	%eax, %eax
 	ret
+# endif
 
 	ALIGN (4)
 L(72bytes):
@@ -1191,13 +1218,16 @@ L(24bytes):
 	pxor	%xmm1, %xmm2
 	ptest	%xmm2, %xmm0
 	jnc	L(less16bytes)
-	mov	-8(%rdi), %rax
+
 	mov	-8(%rsi), %rcx
+	mov	-8(%rdi), %rax
 	cmp	%rax, %rcx
 	jne	L(diffin8bytes)
 	xor	%eax, %eax
 	ret
 
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
 	ALIGN (4)
 L(73bytes):
 	movdqu	-73(%rsi), %xmm1
@@ -1312,7 +1342,7 @@ L(27bytes):
 	jne	L(diffin4bytes)
 	xor	%eax, %eax
 	ret
-
+# endif
 	ALIGN (4)
 L(76bytes):
 	movdqu	-76(%rsi), %xmm1
@@ -1346,13 +1376,19 @@ L(28bytes):
 	mov	-12(%rsi), %rcx
 	cmp	%rax, %rcx
 	jne	L(diffin8bytes)
-	mov	-4(%rdi), %eax
 	mov	-4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+	mov	-4(%rdi), %eax
 	cmp	%eax, %ecx
+# else
+	cmp	-4(%rdi), %ecx
+# endif
 	jne	L(diffin4bytes)
 	xor	%eax, %eax
 	ret
 
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
 	ALIGN (4)
 L(77bytes):
 	movdqu	-77(%rsi), %xmm1
@@ -1474,7 +1510,7 @@ L(31bytes):
 	jne	L(diffin8bytes)
 	xor	%eax, %eax
 	ret
-
+# endif
 	ALIGN (4)
 L(64bytes):
 	movdqu	-64(%rdi), %xmm2
@@ -1527,7 +1563,17 @@ L(diffin8bytes):
 	jne	L(diffin4bytes)
 	shr	$32, %rcx
 	shr	$32, %rax
+
+# ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+	cmp	%eax, %ecx
+	jne	L(diffin4bytes)
+	xor	%eax, %eax
+	ret
+# endif
+
 L(diffin4bytes):
+# ifndef USE_AS_WMEMCMP
 	cmp	%cx, %ax
 	jne	L(diffin2bytes)
 	shr	$16, %ecx
@@ -1546,11 +1592,28 @@ L(end):
 	and	$0xff, %ecx
 	sub	%ecx, %eax
 	ret
+# else
+
+/* for wmemcmp */
+	mov	$1, %eax
+	jl	L(nequal_bigger)
+	neg	%eax
+	ret
+
+	ALIGN (4)
+L(nequal_bigger):
+	ret
+
+L(unreal_case):
+	xor	%eax, %eax
+	ret
+# endif
 
 END (MEMCMP)
 
 	.section .rodata.sse4.1,"a",@progbits
 	ALIGN (3)
+# ifndef USE_AS_WMEMCMP
 L(table_64bytes):
 	.int	JMPTBL (L(0bytes), L(table_64bytes))
 	.int	JMPTBL (L(1bytes), L(table_64bytes))
@@ -1632,4 +1695,87 @@ L(table_64bytes):
 	.int	JMPTBL (L(77bytes), L(table_64bytes))
 	.int	JMPTBL (L(78bytes), L(table_64bytes))
 	.int	JMPTBL (L(79bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(68bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(72bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(76bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+# endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000..b3a2ca1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1997 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful, 
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_ssse3
+# endif
+
+# ifndef ALIGN
+#  define ALIGN(n)	.p2align n
+# endif
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+	test	%rdx, %rdx
+	jz	L(equal)
+# endif
+	mov	%rdx, %rcx
+	mov	%rdi, %rdx
+	cmp	$48, %rcx
+	jae	L(48bytesormore)	/* LEN >= 48  */
+
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+/* RCX >= 48.  */
+L(48bytesormore):
+	movdqu	(%rdi), %xmm3
+	movdqu	(%rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	16(%rdi), %rdi
+	lea	16(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(less16bytes)
+	mov	%edi, %edx
+	and	$0xf, %edx
+	xor	%rdx, %rdi
+	sub	%rdx, %rsi
+	add	%rdx, %rcx
+	mov	%esi, %edx
+	and	$0xf, %edx
+	jz	L(shr_0)
+	xor	%rdx, %rsi
+
+# ifndef USE_AS_WMEMCMP
+	cmp	$8, %edx
+	jae	L(next_unaligned_table)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$1, %edx
+	je	L(shr_1)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$3, %edx
+	je	L(shr_3)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$5, %edx
+	je	L(shr_5)
+	cmp	$6, %edx
+	je	L(shr_6)
+	jmp	L(shr_7)
+
+	ALIGN	(2)
+L(next_unaligned_table):
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$9, %edx
+	je	L(shr_9)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$11, %edx
+	je	L(shr_11)
+	cmp	$12, %edx
+	je	L(shr_12)
+	cmp	$13, %edx
+	je	L(shr_13)
+	cmp	$14, %edx
+	je	L(shr_14)
+	jmp	L(shr_15)
+# else
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$8, %edx
+	je	L(shr_8)
+	jmp	L(shr_12)
+# endif
+
+	ALIGN	(4)
+L(shr_0):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	jae	L(shr_0_gobble)
+	xor	%eax, %eax
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2
+	pand	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_0_gobble):
+	movdqa	(%rsi), %xmm0
+	xor	%eax, %eax
+	pcmpeqb	(%rdi), %xmm0
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm2
+	pcmpeqb	16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+	pand	%xmm0, %xmm2
+	sub	$32, %rcx
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	movdqa	32(%rsi), %xmm0
+	movdqa	48(%rsi), %xmm2
+	sbb	$0xffff, %edx
+	pcmpeqb	32(%rdi), %xmm0
+	pcmpeqb	48(%rdi), %xmm2
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	jz	L(shr_0_gobble_loop)
+
+	pand	%xmm0, %xmm2
+	cmp	$0, %rcx
+	jge	L(next)
+	inc	%edx
+	add	$32, %rcx
+L(next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm2, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	ALIGN	(4)
+L(shr_1):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_1_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$1, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$1, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_1_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$1, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$1, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_1_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$1, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$1, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_1_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_1_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_1_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	1(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+
+	ALIGN	(4)
+L(shr_2):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_2_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$2, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$2, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_2_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$2, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$2, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_2_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$2, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$2, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_2_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_2_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_2_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	2(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_3):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_3_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$3, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$3, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_3_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$3, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$3, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_3_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$3, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$3, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_3_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_3_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_3_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	3(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	ALIGN	(4)
+L(shr_4):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_4_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$4, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$4, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_4_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$4, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$4, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_4_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$4, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$4, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_4_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_4_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_4_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	4(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	ALIGN	(4)
+L(shr_5):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_5_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$5, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$5, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$5, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_5_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$5, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$5, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_5_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$5, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$5, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_5_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_5_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_5_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	5(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_6):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_6_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$6, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$6, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$6, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_6_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$6, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$6, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_6_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$6, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$6, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_6_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_6_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_6_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	6(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_7):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_7_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$7, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$7, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$7, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_7_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$7, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$7, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_7_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$7, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$7, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_7_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_7_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_7_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	7(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	ALIGN	(4)
+L(shr_8):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_8_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$8, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$8, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$8, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_8_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$8, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$8, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_8_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$8, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$8, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_8_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_8_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_8_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	8(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	ALIGN	(4)
+L(shr_9):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_9_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$9, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$9, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$9, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_9_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$9, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$9, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_9_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$9, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$9, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_9_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_9_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_9_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	9(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_10):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_10_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$10, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$10, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$10, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_10_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$10, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$10, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_10_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$10, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$10, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_10_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_10_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_10_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	10(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_11):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_11_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$11, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$11, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$11, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_11_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$11, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$11, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_11_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$11, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$11, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_11_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_11_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_11_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	11(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# endif
+
+	ALIGN	(4)
+L(shr_12):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_12_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$12, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$12, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$12, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_12_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$12, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$12, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_12_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$12, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$12, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_12_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_12_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_12_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	12(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+	ALIGN	(4)
+L(shr_13):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_13_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$13, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$13, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$13, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_13_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$13, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$13, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_13_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$13, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$13, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_13_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_13_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_13_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	13(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_14):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_14_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$14, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$14, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$14, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_14_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$14, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$14, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_14_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$14, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$14, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_14_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_14_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_14_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	14(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_15):
+	cmp	$80, %rcx
+	lea	-48(%rcx), %rcx
+	mov	%edx, %eax
+	jae	L(shr_15_gobble)
+
+	movdqa	16(%rsi), %xmm1
+	movdqa	%xmm1, %xmm2
+	palignr	$15, (%rsi), %xmm1
+	pcmpeqb	(%rdi), %xmm1
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$15, %xmm2, %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+	add	$15, %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+
+	ALIGN	(4)
+L(shr_15_gobble):
+	sub	$32, %rcx
+	movdqa	16(%rsi), %xmm0
+	palignr	$15, (%rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+
+	movdqa	32(%rsi), %xmm3
+	palignr	$15, 16(%rsi), %xmm3
+	pcmpeqb	16(%rdi), %xmm3
+
+L(shr_15_gobble_loop):
+	pand	%xmm0, %xmm3
+	sub	$32, %rcx
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+
+	movdqa	64(%rsi), %xmm3
+	palignr	$15, 48(%rsi), %xmm3
+	sbb	$0xffff, %edx
+	movdqa	48(%rsi), %xmm0
+	palignr	$15, 32(%rsi), %xmm0
+	pcmpeqb	32(%rdi), %xmm0
+	lea	32(%rsi), %rsi
+	pcmpeqb	48(%rdi), %xmm3
+
+	lea	32(%rdi), %rdi
+	jz	L(shr_15_gobble_loop)
+	pand	%xmm0, %xmm3
+
+	cmp	$0, %rcx
+	jge	L(shr_15_gobble_next)
+	inc	%edx
+	add	$32, %rcx
+L(shr_15_gobble_next):
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pmovmskb %xmm3, %edx
+	movdqa	%xmm0, %xmm1
+	lea	32(%rdi), %rdi
+	lea	32(%rsi), %rsi
+	sub	$0xffff, %edx
+	jnz	L(exit)
+
+	lea	15(%rsi), %rsi
+	add	%rcx, %rsi
+	add	%rcx, %rdi
+	jmp	L(less48bytes)
+# endif
+	ALIGN	(4)
+L(exit):
+	pmovmskb %xmm1, %r8d
+	sub	$0xffff, %r8d
+	jz	L(first16bytes)
+	lea	-16(%rsi), %rsi
+	lea	-16(%rdi), %rdi
+	mov	%r8d, %edx
+L(first16bytes):
+	add	%rax, %rsi
+L(less16bytes):
+# ifndef USE_AS_WMEMCMP
+	test	%dl, %dl
+	jz	L(next_24_bytes)
+
+	test	$0x01, %dl
+	jnz	L(Byte16)
+
+	test	$0x02, %dl
+	jnz	L(Byte17)
+
+	test	$0x04, %dl
+	jnz	L(Byte18)
+
+	test	$0x08, %dl
+	jnz	L(Byte19)
+
+	test	$0x10, %dl
+	jnz	L(Byte20)
+
+	test	$0x20, %dl
+	jnz	L(Byte21)
+
+	test	$0x40, %dl
+	jnz	L(Byte22)
+
+	movzbl	-9(%rdi), %eax
+	movzbl	-9(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte16):
+	movzbl	-16(%rdi), %eax
+	movzbl	-16(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte17):
+	movzbl	-15(%rdi), %eax
+	movzbl	-15(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte18):
+	movzbl	-14(%rdi), %eax
+	movzbl	-14(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte19):
+	movzbl	-13(%rdi), %eax
+	movzbl	-13(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte20):
+	movzbl	-12(%rdi), %eax
+	movzbl	-12(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte21):
+	movzbl	-11(%rdi), %eax
+	movzbl	-11(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(Byte22):
+	movzbl	-10(%rdi), %eax
+	movzbl	-10(%rsi), %edx
+	sub	%edx, %eax
+	ret
+
+	ALIGN	(4)
+L(next_24_bytes):
+	lea	8(%rdi), %rdi
+	lea	8(%rsi), %rsi
+	test	$0x01, %dh
+	jnz	L(Byte16)
+
+	test	$0x02, %dh
+	jnz	L(Byte17)
+
+	test	$0x04, %dh
+	jnz	L(Byte18)
+
+	test	$0x08, %dh
+	jnz	L(Byte19)
+
+	test	$0x10, %dh
+	jnz	L(Byte20)
+
+	test	$0x20, %dh
+	jnz	L(Byte21)
+
+	test	$0x40, %dh
+	jnz	L(Byte22)
+
+	mov	-9(%rdi), %eax
+	and	$0xff, %eax
+	mov	-9(%rsi), %edx
+	and	$0xff, %edx
+	sub	%edx, %eax
+	ret
+# else
+/* Special case for wmemcmp.  */
+	xor	%eax, %eax
+	test	%dl, %dl
+	jz	L(next_two_double_words)
+	and	$15, %dl
+	jz	L(second_double_word)
+	mov	-16(%rdi), %eax
+	cmp	-16(%rsi), %eax
+	jne	L(find_diff)
+	ret
+
+	ALIGN	(4)
+L(second_double_word):
+	mov	-12(%rdi), %eax
+	cmp	-12(%rsi), %eax
+	jne	L(find_diff)
+	ret
+
+	ALIGN	(4)
+L(next_two_double_words):
+	and	$15, %dh
+	jz	L(fourth_double_word)
+	mov	-8(%rdi), %eax
+	cmp	-8(%rsi), %eax
+	jne	L(find_diff)
+	ret
+
+	ALIGN	(4)
+L(fourth_double_word):
+	mov	-4(%rdi), %eax
+	cmp	-4(%rsi), %eax
+	jne	L(find_diff)
+	ret
+# endif
+
+	ALIGN	(4)
+L(less48bytes):
+	cmp	$8, %ecx
+	jae	L(more8bytes)
+	cmp	$0, %ecx
+	je	L(0bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$1, %ecx
+	je	L(1bytes)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$3, %ecx
+	je	L(3bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	cmp	$5, %ecx
+	je	L(5bytes)
+	cmp	$6, %ecx
+	je	L(6bytes)
+	jmp	L(7bytes)
+# else
+	jmp	L(4bytes)
+# endif
+
+	ALIGN	(4)
+L(more8bytes):
+	cmp	$16, %ecx
+	jae	L(more16bytes)
+	cmp	$8, %ecx
+	je	L(8bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$9, %ecx
+	je	L(9bytes)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$11, %ecx
+	je	L(11bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	cmp	$13, %ecx
+	je	L(13bytes)
+	cmp	$14, %ecx
+	je	L(14bytes)
+	jmp	L(15bytes)
+# else
+	jmp	L(12bytes)
+# endif
+
+	ALIGN	(4)
+L(more16bytes):
+	cmp	$24, %ecx
+	jae	L(more24bytes)
+	cmp	$16, %ecx
+	je	L(16bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$17, %ecx
+	je	L(17bytes)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$19, %ecx
+	je	L(19bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	cmp	$21, %ecx
+	je	L(21bytes)
+	cmp	$22, %ecx
+	je	L(22bytes)
+	jmp	L(23bytes)
+# else
+	jmp	L(20bytes)
+# endif
+
+	ALIGN	(4)
+L(more24bytes):
+	cmp	$32, %ecx
+	jae	L(more32bytes)
+	cmp	$24, %ecx
+	je	L(24bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$25, %ecx
+	je	L(25bytes)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$27, %ecx
+	je	L(27bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	cmp	$29, %ecx
+	je	L(29bytes)
+	cmp	$30, %ecx
+	je	L(30bytes)
+	jmp	L(31bytes)
+# else
+	jmp	L(28bytes)
+# endif
+
+	ALIGN	(4)
+L(more32bytes):
+	cmp	$40, %ecx
+	jae	L(more40bytes)
+	cmp	$32, %ecx
+	je	L(32bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$33, %ecx
+	je	L(33bytes)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$35, %ecx
+	je	L(35bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	cmp	$37, %ecx
+	je	L(37bytes)
+	cmp	$38, %ecx
+	je	L(38bytes)
+	jmp	L(39bytes)
+# else
+	jmp	L(36bytes)
+# endif
+ 
+	ALIGN	(4)
+L(more40bytes):
+	cmp	$40, %ecx
+	je	L(40bytes)
+# ifndef USE_AS_WMEMCMP
+	cmp	$41, %ecx
+	je	L(41bytes)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$43, %ecx
+	je	L(43bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	cmp	$45, %ecx
+	je	L(45bytes)
+	cmp	$46, %ecx
+	je	L(46bytes)
+	jmp	L(47bytes)
+
+	ALIGN	(4)
+L(44bytes):
+	movl	-44(%rdi), %eax
+	movl	-44(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(40bytes):
+	movl	-40(%rdi), %eax
+	movl	-40(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(36bytes):
+	movl	-36(%rdi), %eax
+	movl	-36(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(32bytes):
+	movl	-32(%rdi), %eax
+	movl	-32(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(28bytes):
+	movl	-28(%rdi), %eax
+	movl	-28(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%rdi), %eax
+	movl	-24(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%rdi), %eax
+	movl	-20(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%rdi), %eax
+	movl	-16(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%rdi), %eax
+	movl	-12(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%rdi), %eax
+	movl	-8(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%rdi), %eax
+	movl	-4(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(0bytes):
+	xor	%eax, %eax
+	ret
+# else
+	ALIGN	(4)
+L(44bytes):
+	movl	-44(%rdi), %eax
+	cmp	-44(%rsi), %eax
+	jne	L(find_diff)
+L(40bytes):
+	movl	-40(%rdi), %eax
+	cmp	-40(%rsi), %eax
+	jne	L(find_diff)
+L(36bytes):
+	movl	-36(%rdi), %eax
+	cmp	-36(%rsi), %eax
+	jne	L(find_diff)
+L(32bytes):
+	movl	-32(%rdi), %eax
+	cmp	-32(%rsi), %eax
+	jne	L(find_diff)
+L(28bytes):
+	movl	-28(%rdi), %eax
+	cmp	-28(%rsi), %eax
+	jne	L(find_diff)
+L(24bytes):
+	movl	-24(%rdi), %eax
+	cmp	-24(%rsi), %eax
+	jne	L(find_diff)
+L(20bytes):
+	movl	-20(%rdi), %eax
+	cmp	-20(%rsi), %eax
+	jne	L(find_diff)
+L(16bytes):
+	movl	-16(%rdi), %eax
+	cmp	-16(%rsi), %eax
+	jne	L(find_diff)
+L(12bytes):
+	movl	-12(%rdi), %eax
+	cmp	-12(%rsi), %eax
+	jne	L(find_diff)
+L(8bytes):
+	movl	-8(%rdi), %eax
+	cmp	-8(%rsi), %eax
+	jne	L(find_diff)
+L(4bytes):
+	movl	-4(%rdi), %eax
+	cmp	-4(%rsi), %eax
+	jne	L(find_diff)
+L(0bytes):
+	xor	%eax, %eax
+	ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+	ALIGN	(4)
+L(45bytes):
+	movl	-45(%rdi), %eax
+	movl	-45(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(41bytes):
+	movl	-41(%rdi), %eax
+	movl	-41(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(37bytes):
+	movl	-37(%rdi), %eax
+	movl	-37(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(33bytes):
+	movl	-33(%rdi), %eax
+	movl	-33(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(29bytes):
+	movl	-29(%rdi), %eax
+	movl	-29(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(25bytes):
+	movl	-25(%rdi), %eax
+	movl	-25(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(21bytes):
+	movl	-21(%rdi), %eax
+	movl	-21(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(17bytes):
+	movl	-17(%rdi), %eax
+	movl	-17(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(13bytes):
+	movl	-13(%rdi), %eax
+	movl	-13(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(9bytes):
+	movl	-9(%rdi), %eax
+	movl	-9(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(5bytes):
+	movl	-5(%rdi), %eax
+	movl	-5(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(1bytes):
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	ALIGN	(4)
+L(46bytes):
+	movl	-46(%rdi), %eax
+	movl	-46(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(42bytes):
+	movl	-42(%rdi), %eax
+	movl	-42(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(38bytes):
+	movl	-38(%rdi), %eax
+	movl	-38(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(34bytes):
+	movl	-34(%rdi), %eax
+	movl	-34(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(30bytes):
+	movl	-30(%rdi), %eax
+	movl	-30(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(26bytes):
+	movl	-26(%rdi), %eax
+	movl	-26(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(22bytes):
+	movl	-22(%rdi), %eax
+	movl	-22(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(18bytes):
+	movl	-18(%rdi), %eax
+	movl	-18(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(14bytes):
+	movl	-14(%rdi), %eax
+	movl	-14(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(10bytes):
+	movl	-10(%rdi), %eax
+	movl	-10(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(6bytes):
+	movl	-6(%rdi), %eax
+	movl	-6(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%rdi), %eax
+	movzwl	-2(%rsi), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmp	%ecx, %eax
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	ALIGN	(4)
+L(47bytes):
+	movl	-47(%rdi), %eax
+	movl	-47(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(43bytes):
+	movl	-43(%rdi), %eax
+	movl	-43(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(39bytes):
+	movl	-39(%rdi), %eax
+	movl	-39(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(35bytes):
+	movl	-35(%rdi), %eax
+	movl	-35(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(31bytes):
+	movl	-31(%rdi), %eax
+	movl	-31(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(27bytes):
+	movl	-27(%rdi), %eax
+	movl	-27(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(23bytes):
+	movl	-23(%rdi), %eax
+	movl	-23(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(19bytes):
+	movl	-19(%rdi), %eax
+	movl	-19(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%rdi), %eax
+	movl	-15(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%rdi), %eax
+	movl	-11(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%rdi), %eax
+	movl	-7(%rsi), %ecx
+	cmp	%ecx, %eax
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%rdi), %eax
+	movzwl	-3(%rsi), %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+	cmp	%ecx, %eax
+	jne	L(set)
+	movzbl	-1(%rdi), %eax
+	cmpb	-1(%rsi), %al
+	jne	L(set)
+	xor	%eax, %eax
+	ret
+
+	ALIGN	(4)
+L(find_diff):
+	cmpb	%cl, %al
+	jne	L(set)
+	cmpw	%cx, %ax
+	jne	L(set)
+	shr	$16, %eax
+	shr	$16, %ecx
+	cmpb	%cl, %al
+	jne	L(set)
+
+/* We reach this point only if we already know there is a
+   difference.  */
+
+	cmp	%ecx, %eax
+L(set):
+	sbb	%eax, %eax
+	sbb	$-1, %eax
+	ret
+# else
+
+/* For wmemcmp.  */
+	ALIGN	(4)
+L(find_diff):
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	ALIGN	(4)
+L(find_diff_bigger):
+	ret
+# endif
+
+	ALIGN	(4)
+L(equal):
+	xor	%eax, %eax
+	ret
+
+END (MEMCMP)
+#endif
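
The bulk of the new file is the sixteen L(shr_N) unrolled blocks above:
%rdi is first brought to 16-byte alignment, and the remaining misalignment
N of %rsi is absorbed by PALIGNR, which stitches two adjacent aligned loads
into the 16 bytes that line up with the aligned side, so PCMPEQB never sees
an unaligned operand.  A rough C-intrinsics sketch of one such block, with
the shift fixed at 1 because PALIGNR takes an immediate (the assembly
emits one copy per shift); compile with -mssse3:

	#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 */

	/* Compare 16 bytes at p (16-byte aligned) with the 16 bytes at
	   q + 1, where q itself is 16-byte aligned.  Returns a bitmask
	   of mismatching byte lanes; 0 means all 16 bytes match.  */
	static int
	cmp16_shifted_by_1 (const void *p, const void *q)
	{
	  __m128i lo = _mm_load_si128 ((const __m128i *) q);	 /* q[0..15]  */
	  __m128i hi = _mm_load_si128 ((const __m128i *) q + 1); /* q[16..31] */
	  __m128i s  = _mm_alignr_epi8 (hi, lo, 1);		 /* q[1..16]  */
	  __m128i eq = _mm_cmpeq_epi8 (s, _mm_load_si128 ((const __m128i *) p));
	  return _mm_movemask_epi8 (eq) ^ 0xffff;
	}

The `sub $0xffff, %edx` after each PMOVMSKB in the assembly is the same
test: the equality mask is exactly 0xffff when all sixteen bytes match.
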
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 301ab28..8bf8f3a 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,5 +1,5 @@
 /* Multiple versions of memcmp
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -29,11 +29,20 @@ ENTRY(memcmp)
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__memcmp_sse2(%rip), %rax
-	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-	jz	2f
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__memcmp_sse2(%rip), %rax
+	ret
+
+2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	jz	3f
 	leaq	__memcmp_sse4_1(%rip), %rax
-2:	ret
+	ret
+
+3:	leaq	__memcmp_ssse3(%rip), %rax
+	ret
+
 END(memcmp)
 
 # undef ENTRY
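
The rewritten resolver above now has three exits instead of two: SSE2 when
SSSE3 is absent, SSE4.1 when available, and the new SSSE3 version in
between.  A hedged C analogue of that selection order, using GCC's
__builtin_cpu_supports in place of glibc's internal __cpu_features table:

	#include <stddef.h>

	extern int __memcmp_sse2 (const void *, const void *, size_t);
	extern int __memcmp_ssse3 (const void *, const void *, size_t);
	extern int __memcmp_sse4_1 (const void *, const void *, size_t);

	typedef int (*memcmp_fn) (const void *, const void *, size_t);

	static memcmp_fn
	select_memcmp (void)
	{
	  if (!__builtin_cpu_supports ("ssse3"))
	    return __memcmp_sse2;	/* no SSSE3 at all -> baseline */
	  if (__builtin_cpu_supports ("sse4.1"))
	    return __memcmp_sse4_1;	/* best available */
	  return __memcmp_ssse3;	/* SSSE3-only parts, e.g. Atom */
	}
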
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000..793f059
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP  __wmemcmp_sse2
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000..b07973a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_1
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000..a41ef95
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
similarity index 55%
copy from sysdeps/x86_64/multiarch/memcmp.S
copy to sysdeps/x86_64/multiarch/wmemcmp.S
index 301ab28..7c3b7ed 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -1,5 +1,5 @@
-/* Multiple versions of memcmp
-   Copyright (C) 2010 Free Software Foundation, Inc.
+/* Multiple versions of wmemcmp
+   Copyright (C) 2011 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
@@ -24,36 +24,24 @@
 /* Define multiple versions only for the definition in libc. */
 #ifndef NOT_IN_libc
 	.text
-ENTRY(memcmp)
-	.type	memcmp, @gnu_indirect_function
+ENTRY(wmemcmp)
+	.type	wmemcmp, @gnu_indirect_function
 	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__memcmp_sse2(%rip), %rax
-	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
-	jz	2f
-	leaq	__memcmp_sse4_1(%rip), %rax
-2:	ret
-END(memcmp)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcmp_sse2, @function; \
-	.p2align 4; \
-	__memcmp_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
-#endif
 
-#include "../memcmp.S"
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__wmemcmp_sse2(%rip), %rax
+	ret
+
+2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	jz	3f
+	leaq	__wmemcmp_sse4_1(%rip), %rax
+	ret
+
+3:	leaq	__wmemcmp_ssse3(%rip), %rax
+	ret
+
+END(wmemcmp)
+#endif
diff --git a/wcsmbs/wmemcmp.c b/wcsmbs/wmemcmp.c
index c6a321b..e7edc87 100644
--- a/wcsmbs/wmemcmp.c
+++ b/wcsmbs/wmemcmp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997, 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
 
@@ -19,9 +19,12 @@
 
 #include <wchar.h>
 
+#ifndef WMEMCMP
+# define WMEMCMP wmemcmp
+#endif
 
 int
-wmemcmp (s1, s2, n)
+WMEMCMP (s1, s2, n)
      const wchar_t *s1;
      const wchar_t *s2;
      size_t n;
@@ -34,19 +37,19 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[1];
       c2 = (wint_t) s2[1];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[2];
       c2 = (wint_t) s2[2];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       c1 = (wint_t) s1[3];
       c2 = (wint_t) s2[3];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       s1 += 4;
       s2 += 4;
       n -= 4;
@@ -57,7 +60,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       ++s1;
       ++s2;
       --n;
@@ -67,7 +70,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
       ++s1;
       ++s2;
       --n;
@@ -77,7 +80,7 @@ wmemcmp (s1, s2, n)
       c1 = (wint_t) s1[0];
       c2 = (wint_t) s2[0];
       if (c1 - c2 != 0)
-	return c1 - c2;
+	return c1 > c2 ? 1 : -1;
     }
 
   return 0;
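
The C fallback changes above replace "return c1 - c2" with a hard-coded
1 / -1: with 32-bit wide characters the difference can exceed the range of
int, so the truncated value handed back to callers can carry the wrong
sign.  A self-contained illustration with hypothetical element values:

	#include <stdio.h>

	int
	main (void)
	{
	  int c1 = -2;			/* a negative wchar_t element */
	  int c2 = 0x7fffffff;		/* an INT_MAX element */
	  /* True ordering: c1 < c2, so wmemcmp must return < 0.  */
	  long long true_diff = (long long) c1 - (long long) c2;
	  printf ("true difference = %lld\n", true_diff);	/* negative */
	  /* The old 32-bit subtraction wraps to a positive int:  */
	  printf ("wrapped int     = %d\n",
		  (int) ((unsigned) c1 - (unsigned) c2));	/* 2147483647 */
	  printf ("patched result  = %d\n", c1 > c2 ? 1 : -1);	/* -1 */
	  return 0;
	}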

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                   |   29 +
 NEWS                                        |    2 +-
 string/test-memcmp.c                        |   47 +-
 sysdeps/i386/i686/multiarch/Makefile        |    3 +-
 sysdeps/i386/i686/multiarch/memcmp-sse4.S   |  396 +++++--
 sysdeps/i386/i686/multiarch/memcmp-ssse3.S  |  565 +++++---
 sysdeps/i386/i686/multiarch/wmemcmp-c.c     |    5 +
 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S  |    4 +
 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S |    4 +
 sysdeps/i386/i686/multiarch/wmemcmp.S       |   59 +
 sysdeps/x86_64/multiarch/Makefile           |    3 +-
 sysdeps/x86_64/multiarch/memcmp-sse4.S      |  192 +++-
 sysdeps/x86_64/multiarch/memcmp-ssse3.S     | 1997 +++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memcmp.S           |   19 +-
 sysdeps/x86_64/multiarch/wmemcmp-c.c        |    5 +
 sysdeps/x86_64/multiarch/wmemcmp-sse4.S     |    4 +
 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S    |    4 +
 sysdeps/x86_64/multiarch/wmemcmp.S          |   47 +
 wcsmbs/wmemcmp.c                            |   21 +-
 19 files changed, 3070 insertions(+), 336 deletions(-)
 create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-c.c
 create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
 create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
 create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp.S
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp.S


hooks/post-receive
-- 
GNU C Library master sources

