This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.14-364-g2d1f3a4
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 15 Oct 2011 15:11:20 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.14-364-g2d1f3a4
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 2d1f3a4db65d2731a695dee6b973accea8b9adc0 (commit)
via be13f7bff66e1850f9057dd813d6e7be022d9516 (commit)
from 556a2007974ed39a68c87a8b5181f8057ecd0d6f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2d1f3a4db65d2731a695dee6b973accea8b9adc0
commit 2d1f3a4db65d2731a695dee6b973accea8b9adc0
Author: Ulrich Drepper <drepper@gmail.com>
Date: Sat Oct 15 11:11:12 2011 -0400
Fix WS
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index b3a2ca1..e35a23e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -8,7 +8,7 @@
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
+ The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
@@ -31,8 +31,8 @@
# endif
/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
*/
atom_text_section
@@ -1625,7 +1625,7 @@ L(more32bytes):
# else
jmp L(36bytes)
# endif
-
+
ALIGN (4)
L(more40bytes):
cmp $40, %ecx
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=be13f7bff66e1850f9057dd813d6e7be022d9516
commit be13f7bff66e1850f9057dd813d6e7be022d9516
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date: Sat Oct 15 11:10:08 2011 -0400
Optimized memcmp and wmemcmp for x86-64 and x86-32
diff --git a/ChangeLog b/ChangeLog
index 49f091a..414611a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,32 @@
+2011-09-27 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/x86_64/multiarch/Makefile: (sysdep_routines): Add
+ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+ * sysdeps/x86_64/multiarch/memcmp-ssse3: New file.
+ * sysdeps/x86_64/multiarch/memcmp.S: Update. Add __memcmp_ssse3.
+ * sysdeps/x86_64/multiarch/memcmp-sse4.S: Update.
+ (USE_AS_WMEMCMP): New macro.
+ Fixing indents.
+ * sysdeps/x86_64/multiarch/wmemcmp.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-sse4.S: New file.
+ * sysdeps/x86_64/multiarch/wmemcmp-c.S: New file.
+ * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+ wmemcmp-ssse3 wmemcmp-sse4 wmemcmp-c
+ * sysdeps/i386/i686/multiarch/wmemcmp.S: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-c.c: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/wmemcmp-sse4.S: New file.
+ * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Update.
+ (USE_AS_WMEMCMP): New macro.
+ * sysdeps/i386/i686/multiarch/memcmp-ssse3: Likewise.
+ * sysdeps/string/test-memcmp.c: Update.
+ Fix simple_wmemcmp.
+ Add new tests.
+ * wcsmbs/wmemcmp.c: Update.
+ (WMEMCMP): New macro.
+ Fix overflow bug.
+
2011-10-12 Andreas Jaeger <aj@suse.de>
[BZ #13268]
diff --git a/NEWS b/NEWS
index 7e9b2c1..cdb2973 100644
--- a/NEWS
+++ b/NEWS
@@ -33,7 +33,7 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
-* Optimized memchr, memrchr, rawmemchr for x86-64 and x86-32.
+* Optimized memchr, memrchr, rawmemchr, memcmp, wmemcmp for x86-64 and x86-32.
Contributed by Liubov Dmitrieva.
* New interfaces: scandirat, scandirat64
diff --git a/string/test-memcmp.c b/string/test-memcmp.c
index 4675bd9..f246d3a 100644
--- a/string/test-memcmp.c
+++ b/string/test-memcmp.c
@@ -29,9 +29,21 @@
# define MEMCPY wmemcpy
# define SIMPLE_MEMCMP simple_wmemcmp
# define CHAR wchar_t
-# define MAX_CHAR 256000
-# define UCHAR uint32_t
+# define UCHAR wchar_t
# define CHARBYTES 4
+# define CHAR__MIN WCHAR_MIN
+# define CHAR__MAX WCHAR_MAX
+int
+simple_wmemcmp (const wchar_t *s1, const wchar_t *s2, size_t n)
+{
+ int ret = 0;
+ /* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+ */
+ while (n-- && (ret = *s1 < *s2 ? -1 : *s1 == *s2 ? 0 : 1) == 0) {s1++; s2++;}
+ return ret;
+}
#else
# define MEMCMP memcmp
# define MEMCPY memcpy
@@ -40,18 +52,20 @@
# define MAX_CHAR 255
# define UCHAR unsigned char
# define CHARBYTES 1
-#endif
-
-typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+# define CHAR__MIN CHAR_MIN
+# define CHAR__MAX CHAR_MAX
int
-SIMPLE_MEMCMP (const CHAR *s1, const CHAR *s2, size_t n)
+simple_memcmp (const char *s1, const char *s2, size_t n)
{
int ret = 0;
- while (n-- && (ret = *(UCHAR *) s1++ - *(UCHAR *) s2++) == 0);
+ while (n-- && (ret = *(unsigned char *) s1++ - *(unsigned char *) s2++) == 0);
return ret;
}
+#endif
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
IMPL (SIMPLE_MEMCMP, 0)
IMPL (MEMCMP, 1)
@@ -121,7 +135,7 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
s2 = (CHAR *) (buf2 + align2);
for (i = 0; i < len; i++)
- s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
+ s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
s1[len] = align1;
s2[len] = align2;
@@ -412,8 +426,8 @@ check1 (void)
s2[99] = 1;
s1[100] = 116;
s2[100] = 116;
- s1[101] = -13;
- s2[101] = -13;
+ s1[101] = CHAR__MIN;
+ s2[101] = CHAR__MAX;
s1[102] = -109;
s2[102] = -109;
s1[103] = 1;
@@ -434,8 +448,8 @@ check1 (void)
s2[110] = -109;
s1[111] = 1;
s2[111] = 1;
- s1[112] = 20;
- s2[112] = 20;
+ s1[112] = CHAR__MAX;
+ s2[112] = CHAR__MIN;
s1[113] = -13;
s2[113] = -13;
s1[114] = -109;
@@ -444,9 +458,12 @@ check1 (void)
s2[115] = 1;
n = 116;
- exp_result = SIMPLE_MEMCMP (s1, s2, n);
- FOR_EACH_IMPL (impl, 0)
- check_result (impl, s1, s2, n, exp_result);
+ for (size_t i = 0; i < n; i++)
+ {
+ exp_result = SIMPLE_MEMCMP (s1 + i, s2 + i, n - i);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1 + i, s2 + i, n - i, exp_result);
+ }
}
int
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 8a4c219..98d1ad6 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -17,7 +17,8 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \
wcscmp-sse2 wcscmp-c memchr-sse2 memchr-sse2-bsf \
memrchr-sse2 memrchr-sse2-bsf memrchr-c \
- rawmemchr-sse2 rawmemchr-sse2-bsf
+ rawmemchr-sse2 rawmemchr-sse2-bsf \
+ wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
index b1ed778..1f5dbc1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-sse4.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.2
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.2, wmemcmp with SSE4.2
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,84 +20,97 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_2
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_2
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1 + 4
+# define LEN BLK2 + 4
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
-#ifdef SHARED
-# define JMPTBL(I, B) I - B
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table and adjuested EDX/ESI. Go. */ \
- jmp *%ebx
-
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
- ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
- movl (%esp), %ebx
- ret
-#else
-# define JMPTBL(I, B) I
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ call __i686.get_pc_thunk.bx; \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjuested EDX/ESI. Go. */ \
+ jmp *%ebx
+# else
+# define JMPTBL(I, B) I
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-#endif
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
movl BLK1(%esp), %eax
movl BLK2(%esp), %edx
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+# else
cmp $1, %ecx
jbe L(less1bytes)
+# endif
+
pxor %xmm0, %xmm0
cmp $64, %ecx
ja L(64bytesormore)
cmp $8, %ecx
- PUSH (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+# else
jb L(less8bytes)
+ PUSH (%ebx)
+# endif
+
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less8bytes):
mov (%eax), %bl
cmpb (%edx), %bl
@@ -141,22 +154,49 @@ L(less8bytes):
mov 6(%eax), %bl
cmpb 6(%edx), %bl
je L(0bytes)
+
L(nonzero):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(above)
neg %eax
L(above):
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+ .p2align 4
L(0bytes):
- POP (%ebx)
+ POP (%ebx)
xor %eax, %eax
ret
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(0bytesend)
movzbl (%eax), %eax
@@ -164,14 +204,14 @@ L(less1bytes):
sub %edx, %eax
ret
- ALIGN (4)
+ .p2align 4
L(0bytesend):
xor %eax, %eax
ret
-
- ALIGN (4)
+# endif
+ .p2align 4
L(64bytesormore):
- PUSH (%ebx)
+ PUSH (%ebx)
mov %ecx, %ebx
mov $64, %ecx
sub $64, %ebx
@@ -208,7 +248,14 @@ L(64bytesormore_loop):
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
- ALIGN (4)
+# ifdef USE_AS_WMEMCMP
+
+/* Label needs only for table_64bytes filling */
+L(unreal_case):
+/* no code here */
+
+# endif
+ .p2align 4
L(find_16diff):
sub $16, %ecx
L(find_32diff):
@@ -218,9 +265,9 @@ L(find_48diff):
L(find_64diff):
add %ecx, %edx
add %ecx, %eax
- jmp L(16bytes)
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(16bytes):
mov -16(%eax), %ecx
mov -16(%edx), %ebx
@@ -243,8 +290,30 @@ L(4bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(49bytes):
movdqu -49(%eax), %xmm1
movdqu -49(%edx), %xmm2
@@ -285,7 +354,7 @@ L(5bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(50bytes):
mov $-50, %ebx
movdqu -50(%eax), %xmm1
@@ -330,7 +399,7 @@ L(2bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(51bytes):
mov $-51, %ebx
movdqu -51(%eax), %xmm1
@@ -378,8 +447,8 @@ L(1bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(52bytes):
movdqu -52(%eax), %xmm1
movdqu -52(%edx), %xmm2
@@ -402,13 +471,18 @@ L(20bytes):
ptest %xmm2, %xmm0
jnc L(less16bytes)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(53bytes):
movdqu -53(%eax), %xmm1
movdqu -53(%edx), %xmm2
@@ -440,7 +514,7 @@ L(21bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(54bytes):
movdqu -54(%eax), %xmm1
movdqu -54(%edx), %xmm2
@@ -476,7 +550,7 @@ L(22bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(55bytes):
movdqu -55(%eax), %xmm1
movdqu -55(%edx), %xmm2
@@ -513,8 +587,8 @@ L(23bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(56bytes):
movdqu -56(%eax), %xmm1
movdqu -56(%edx), %xmm2
@@ -538,18 +612,27 @@ L(24bytes):
jnc L(less16bytes)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(57bytes):
movdqu -57(%eax), %xmm1
movdqu -57(%edx), %xmm2
@@ -585,7 +668,7 @@ L(25bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(58bytes):
movdqu -58(%eax), %xmm1
movdqu -58(%edx), %xmm2
@@ -627,7 +710,7 @@ L(26bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(59bytes):
movdqu -59(%eax), %xmm1
movdqu -59(%edx), %xmm2
@@ -668,8 +751,8 @@ L(27bytes):
mov $0, %eax
jne L(end)
RETURN
-
- ALIGN (4)
+# endif
+ .p2align 4
L(60bytes):
movdqu -60(%eax), %xmm1
movdqu -60(%edx), %xmm2
@@ -691,22 +774,38 @@ L(28bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
+
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(61bytes):
movdqu -61(%eax), %xmm1
movdqu -61(%edx), %xmm2
@@ -749,7 +848,7 @@ L(29bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(62bytes):
movdqu -62(%eax), %xmm1
movdqu -62(%edx), %xmm2
@@ -792,7 +891,7 @@ L(30bytes):
jne L(end)
RETURN
- ALIGN (4)
+ .p2align 4
L(63bytes):
movdqu -63(%eax), %xmm1
movdqu -63(%edx), %xmm2
@@ -838,8 +937,9 @@ L(31bytes):
mov $0, %eax
jne L(end)
RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(64bytes):
movdqu -64(%eax), %xmm1
movdqu -64(%edx), %xmm2
@@ -863,28 +963,45 @@ L(32bytes):
jnc L(less16bytes)
mov -16(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -16(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -16(%edx), %ecx
+# endif
jne L(find_diff)
mov -12(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -12(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -12(%edx), %ecx
+# endif
jne L(find_diff)
mov -8(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -8(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -8(%edx), %ecx
+# endif
jne L(find_diff)
mov -4(%eax), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%edx), %ebx
cmp %ebx, %ecx
+# else
+ cmp -4(%edx), %ecx
+# endif
mov $0, %eax
jne L(find_diff)
RETURN
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less16bytes):
add %ebx, %eax
add %ebx, %edx
@@ -910,9 +1027,35 @@ L(less16bytes):
mov $0, %eax
jne L(find_diff)
RETURN
+# else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+# endif
- ALIGN (4)
+ .p2align 4
L(find_diff):
+# ifndef USE_AS_WMEMCMP
cmpb %bl, %cl
jne L(end)
cmp %bx, %cx
@@ -923,17 +1066,29 @@ L(find_diff):
jne L(end)
cmp %bx, %cx
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.2,"a",@progbits
- ALIGN (2)
+ .p2align 2
.type L(table_64bytes), @object
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1000,5 +1155,72 @@ L(table_64bytes):
.int JMPTBL (L(62bytes), L(table_64bytes))
.int JMPTBL (L(63bytes), L(table_64bytes))
.int JMPTBL (L(64bytes), L(table_64bytes))
- .size L(table_64bytes), .-L(table_64bytes)
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+# endif
#endif
diff --git a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
index 2e0d15f..eab85c1 100644
--- a/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcmp-ssse3.S
@@ -1,5 +1,5 @@
-/* memcmp with SSSE3
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,47 +20,64 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PARMS 4
+# define BLK1 PARMS
+# define BLK2 BLK1+4
+# define LEN BLK2+4
+# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
+# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
-#define PARMS 4
-#define BLK1 PARMS
-#define BLK2 BLK1+4
-#define LEN BLK2+4
-#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
-#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
- .section .text.ssse3,"ax",@progbits
+ atom_text_section
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
+
+# ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(zero)
+# endif
+
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
+
+# ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
- PUSH (%ebx)
+# endif
+
+ PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
- ALIGN (4)
- CFI_POP (%ebx)
+ CFI_POP (%ebx)
+
+# ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@@ -71,29 +88,30 @@ L(less1bytes):
neg %eax
L(1bytesend):
ret
+# endif
- ALIGN (4)
+ .p2align 4
L(zero):
- mov $0, %eax
+ xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
cfi_remember_state
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
movl %eax, %edi
movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
lea 16(%edi), %edi
- sub $0xffff, %edx
+ sub $0xffff, %edx
lea 16(%esi), %esi
- jnz L(less16bytes)
+ jnz L(less16bytes)
mov %edi, %edx
and $0xf, %edx
xor %edx, %edi
@@ -104,6 +122,7 @@ L(48bytesormore):
jz L(shr_0)
xor %edx, %esi
+# ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@@ -122,7 +141,7 @@ L(48bytesormore):
je L(shr_6)
jmp L(shr_7)
- ALIGN (4)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -139,8 +158,17 @@ L(next_unaligned_table):
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@@ -159,13 +187,13 @@ L(shr_0):
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@@ -205,13 +233,14 @@ L(shr_0_gobble_loop_next):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -235,13 +264,13 @@ L(shr_1):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -288,14 +317,14 @@ L(shr_1_gobble_next):
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -319,13 +348,13 @@ L(shr_2):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -372,13 +401,13 @@ L(shr_2_gobble_next):
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -402,13 +431,13 @@ L(shr_3):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -455,13 +484,14 @@ L(shr_3_gobble_next):
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -485,13 +515,13 @@ L(shr_4):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -538,13 +568,14 @@ L(shr_4_gobble_next):
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -568,13 +599,13 @@ L(shr_5):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -621,13 +652,13 @@ L(shr_5_gobble_next):
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -651,13 +682,13 @@ L(shr_6):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -704,13 +735,13 @@ L(shr_6_gobble_next):
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -734,13 +765,13 @@ L(shr_7):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -787,13 +818,14 @@ L(shr_7_gobble_next):
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -817,13 +849,13 @@ L(shr_8):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -870,13 +902,14 @@ L(shr_8_gobble_next):
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -900,13 +933,13 @@ L(shr_9):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -953,13 +986,13 @@ L(shr_9_gobble_next):
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -983,13 +1016,13 @@ L(shr_10):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1036,13 +1069,13 @@ L(shr_10_gobble_next):
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1066,13 +1099,13 @@ L(shr_11):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1119,13 +1152,14 @@ L(shr_11_gobble_next):
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1149,13 +1183,13 @@ L(shr_12):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1202,13 +1236,14 @@ L(shr_12_gobble_next):
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1232,13 +1267,13 @@ L(shr_13):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1285,13 +1320,13 @@ L(shr_13_gobble_next):
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1315,13 +1350,13 @@ L(shr_14):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1368,13 +1403,13 @@ L(shr_14_gobble_next):
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1398,13 +1433,13 @@ L(shr_15):
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1451,13 +1486,14 @@ L(shr_15_gobble_next):
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+# endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@@ -1465,9 +1501,12 @@ L(exit):
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
+
L(first16bytes):
add %eax, %esi
L(less16bytes):
+
+# ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@@ -1492,61 +1531,61 @@ L(less16bytes):
test $0x40, %dl
jnz L(Byte22)
L(Byte23):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte16):
- movzbl -16(%edi), %eax
- movzbl -16(%esi), %edx
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte17):
- movzbl -15(%edi), %eax
- movzbl -15(%esi), %edx
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte18):
- movzbl -14(%edi), %eax
- movzbl -14(%esi), %edx
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte19):
- movzbl -13(%edi), %eax
- movzbl -13(%esi), %edx
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte20):
- movzbl -12(%edi), %eax
- movzbl -12(%esi), %edx
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte21):
- movzbl -11(%edi), %eax
- movzbl -11(%esi), %edx
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte22):
- movzbl -10(%edi), %eax
- movzbl -10(%esi), %edx
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@@ -1571,20 +1610,69 @@ L(next_24_bytes):
test $0x40, %dh
jnz L(Byte22)
- ALIGN (4)
+ .p2align 4
L(Byte31):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
+# else
+
+/* special for wmemcmp */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %eax
+ cmp -16(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %eax
+ cmp -12(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %eax
+ cmp -8(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %eax
+ cmp -4(%esi), %eax
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+# endif
CFI_PUSH (%ebx)
- ALIGN (4)
+
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
+# ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@@ -1598,13 +1686,17 @@ L(more8bytes):
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
+# ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@@ -1618,13 +1710,17 @@ L(more16bytes):
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
+# ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@@ -1638,13 +1734,17 @@ L(more24bytes):
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
+# ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@@ -1658,11 +1758,35 @@ L(more32bytes):
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
- ALIGN (4)
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
+# ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@@ -1677,23 +1801,7 @@ L(more40bytes):
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-
- ALIGN (4)
+ .p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@@ -1750,11 +1858,64 @@ L(4bytes):
cmp %ebx, %ecx
mov $0, %eax
jne L(find_diff)
- POP (%ebx)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+# else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
+# endif
- ALIGN (4)
+# ifndef USE_AS_WMEMCMP
+
+ .p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@@ -1814,11 +1975,11 @@ L(5bytes):
cmp -1(%edx), %cl
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@@ -1882,11 +2043,11 @@ L(2bytes):
cmp %bh, %ch
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@@ -1953,11 +2114,11 @@ L(3bytes):
cmpb -1(%edx), %al
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@@ -1968,14 +2129,30 @@ L(find_diff):
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
+
+ .p2align 4
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+# else
-END (MEMCMP)
+/* for wmemcmp */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+# endif
+END (MEMCMP)
#endif
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000..94ff615
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_ia32
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000..1a857c7
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_2
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000..a41ef95
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
similarity index 53%
copy from sysdeps/x86_64/multiarch/memcmp.S
copy to sysdeps/i386/i686/multiarch/wmemcmp.S
index 301ab28..5080c14 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -1,5 +1,5 @@
-/* Multiple versions of memcmp
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -22,38 +22,38 @@
#include <init-arch.h>
/* Define multiple versions only for the definition in libc. */
+
#ifndef NOT_IN_libc
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+ __i686.get_pc_thunk.bx:
+ movl (%esp), %ebx
+ ret
+
.text
-ENTRY(memcmp)
- .type memcmp, @gnu_indirect_function
- cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
jne 1f
call __init_cpu_features
-1: leaq __memcmp_sse2(%rip), %rax
- testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+1: leal __wmemcmp_ia32@GOTOFF(%ebx), %eax
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
jz 2f
- leaq __memcmp_sse4_1(%rip), %rax
-2: ret
-END(memcmp)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __memcmp_sse2, @function; \
- .p2align 4; \
- __memcmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
- they will be called without setting up EBX needed for PLT which is
- used by IFUNC. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
+ leal __wmemcmp_ssse3@GOTOFF(%ebx), %eax
+ testl $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+ jz 2f
+ leal __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(wmemcmp)
#endif
-
-#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a5254dc..e0bb984 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index fc439bb..28dd505 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -1,5 +1,5 @@
-/* memcmp with SSE4.1
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* memcmp with SSE4.1, wmemcmp with SSE4.1
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -20,43 +20,54 @@
#ifndef NOT_IN_libc
-#include <sysdep.h>
-#include "asm-syntax.h"
+# include <sysdep.h>
-#ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-#endif
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse4_1
+# endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
-#define JMPTBL(I, B) (I - B)
+# define JMPTBL(I, B) (I - B)
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
lea TABLE(%rip), %r11; \
movslq (%r11, INDEX, SCALE), %rcx; \
add %r11, %rcx; \
jmp *%rcx; \
ud2
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+# endif
pxor %xmm0, %xmm0
cmp $79, %rdx
ja L(79bytesormore)
+# ifndef USE_AS_WMEMCMP
cmp $1, %rdx
je L(firstbyte)
+# endif
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+# ifndef USE_AS_WMEMCMP
ALIGN (4)
L(firstbyte):
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(79bytesormore):
@@ -308,11 +319,11 @@ L(less32bytesin256):
ALIGN (4)
L(512bytesormore):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -624,11 +635,11 @@ L(less32bytesin256in2alinged):
ALIGN (4)
L(512bytesormorein2aligned):
-#ifdef DATA_CACHE_SIZE_HALF
+# ifdef DATA_CACHE_SIZE_HALF
mov $DATA_CACHE_SIZE_HALF, %r8
-#else
+# else
mov __x86_64_data_cache_size_half(%rip), %r8
-#endif
+# endif
mov %r8, %r9
shr $1, %r8
add %r9, %r8
@@ -667,6 +678,7 @@ L(64bytesormore_loopin2aligned):
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
L(L2_L3_cache_aglined):
sub $64, %rdx
+
ALIGN (4)
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
@@ -803,13 +815,19 @@ L(12bytes):
jne L(diffin8bytes)
L(4bytes):
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
L(0bytes):
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal case for wmemcmp */
ALIGN (4)
L(65bytes):
movdqu -65(%rdi), %xmm1
@@ -1017,6 +1035,7 @@ L(1bytes):
movzbl -1(%rsi), %ecx
sub %ecx, %eax
ret
+# endif
ALIGN (4)
L(68bytes):
@@ -1047,13 +1066,20 @@ L(20bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(69bytes):
movdqu -69(%rsi), %xmm1
@@ -1161,6 +1187,7 @@ L(23bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
+# endif
ALIGN (4)
L(72bytes):
@@ -1191,13 +1218,16 @@ L(24bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
- mov -8(%rdi), %rax
+
mov -8(%rsi), %rcx
+ mov -8(%rdi), %rax
cmp %rax, %rcx
jne L(diffin8bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(73bytes):
movdqu -73(%rsi), %xmm1
@@ -1312,7 +1342,7 @@ L(27bytes):
jne L(diffin4bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(76bytes):
movdqu -76(%rsi), %xmm1
@@ -1346,13 +1376,19 @@ L(28bytes):
mov -12(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- mov -4(%rdi), %eax
mov -4(%rsi), %ecx
+# ifndef USE_AS_WMEMCMP
+ mov -4(%rdi), %eax
cmp %eax, %ecx
+# else
+ cmp -4(%rdi), %ecx
+# endif
jne L(diffin4bytes)
xor %eax, %eax
ret
+# ifndef USE_AS_WMEMCMP
+/* unreal cases for wmemcmp */
ALIGN (4)
L(77bytes):
movdqu -77(%rsi), %xmm1
@@ -1474,7 +1510,7 @@ L(31bytes):
jne L(diffin8bytes)
xor %eax, %eax
ret
-
+# endif
ALIGN (4)
L(64bytes):
movdqu -64(%rdi), %xmm2
@@ -1527,7 +1563,17 @@ L(diffin8bytes):
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
+
+# ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+# endif
+
L(diffin4bytes):
+# ifndef USE_AS_WMEMCMP
cmp %cx, %ax
jne L(diffin2bytes)
shr $16, %ecx
@@ -1546,11 +1592,28 @@ L(end):
and $0xff, %ecx
sub %ecx, %eax
ret
+# else
+
+/* for wmemcmp */
+ mov $1, %eax
+ jl L(nequal_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(nequal_bigger):
+ ret
+
+L(unreal_case):
+ xor %eax, %eax
+ ret
+# endif
END (MEMCMP)
.section .rodata.sse4.1,"a",@progbits
ALIGN (3)
+# ifndef USE_AS_WMEMCMP
L(table_64bytes):
.int JMPTBL (L(0bytes), L(table_64bytes))
.int JMPTBL (L(1bytes), L(table_64bytes))
@@ -1632,4 +1695,87 @@ L(table_64bytes):
.int JMPTBL (L(77bytes), L(table_64bytes))
.int JMPTBL (L(78bytes), L(table_64bytes))
.int JMPTBL (L(79bytes), L(table_64bytes))
+# else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
new file mode 100644
index 0000000..b3a2ca1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -0,0 +1,1997 @@
+/* memcmp with SSSE3, wmemcmp with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_ssse3
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+ atom_text_section
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+ test %rdx, %rdx
+ jz L(equal)
+# endif
+ mov %rdx, %rcx
+ mov %rdi, %rdx
+ cmp $48, %rcx;
+ jae L(48bytesormore) /* LEN => 48 */
+
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+/* ECX >= 32. */
+L(48bytesormore):
+ movdqu (%rdi), %xmm3
+ movdqu (%rsi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(less16bytes)
+ mov %edi, %edx
+ and $0xf, %edx
+ xor %rdx, %rdi
+ sub %rdx, %rsi
+ add %rdx, %rcx
+ mov %esi, %edx
+ and $0xf, %edx
+ jz L(shr_0)
+ xor %rdx, %rsi
+
+# ifndef USE_AS_WMEMCMP
+ cmp $8, %edx
+ jae L(next_unaligned_table)
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $1, %edx
+ je L(shr_1)
+ cmp $2, %edx
+ je L(shr_2)
+ cmp $3, %edx
+ je L(shr_3)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $5, %edx
+ je L(shr_5)
+ cmp $6, %edx
+ je L(shr_6)
+ jmp L(shr_7)
+
+ ALIGN (2)
+L(next_unaligned_table):
+ cmp $8, %edx
+ je L(shr_8)
+ cmp $9, %edx
+ je L(shr_9)
+ cmp $10, %edx
+ je L(shr_10)
+ cmp $11, %edx
+ je L(shr_11)
+ cmp $12, %edx
+ je L(shr_12)
+ cmp $13, %edx
+ je L(shr_13)
+ cmp $14, %edx
+ je L(shr_14)
+ jmp L(shr_15)
+# else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+# endif
+
+ ALIGN (4)
+L(shr_0):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ jae L(shr_0_gobble)
+ xor %eax, %eax
+ movdqa (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+ pand %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_0_gobble):
+ movdqa (%rsi), %xmm0
+ xor %eax, %eax
+ pcmpeqb (%rdi), %xmm0
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm2
+L(shr_0_gobble_loop):
+ pand %xmm0, %xmm2
+ sub $32, %rcx
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ movdqa 32(%rsi), %xmm0
+ movdqa 48(%rsi), %xmm2
+ sbb $0xffff, %edx
+ pcmpeqb 32(%rdi), %xmm0
+ pcmpeqb 48(%rdi), %xmm2
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ jz L(shr_0_gobble_loop)
+
+ pand %xmm0, %xmm2
+ cmp $0, %rcx
+ jge L(next)
+ inc %edx
+ add $32, %rcx
+L(next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm2, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_1):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_1_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $1, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $1, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_1_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $1, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $1, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_1_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $1, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $1, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_1_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_1_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_1_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 1(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+
+ ALIGN (4)
+L(shr_2):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_2_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $2, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $2, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_2_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $2, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $2, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_2_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $2, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $2, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_2_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_2_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_2_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 2(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_3_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $3, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $3, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_3_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $3, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $3, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_3_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $3, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $3, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_3_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_3_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_3_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 3(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_4):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_4_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $4, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $4, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_4_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $4, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $4, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_4_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $4, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $4, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_4_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_4_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_4_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 4(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_5):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_5_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $5, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $5, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_5_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $5, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $5, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_5_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $5, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $5, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_5_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_5_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_5_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 5(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_6_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $6, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $6, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_6_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $6, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $6, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_6_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $6, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $6, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_6_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_6_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_6_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 6(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_7_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $7, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $7, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_7_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $7, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $7, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_7_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $7, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $7, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_7_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_7_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_7_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 7(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_8):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_8_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $8, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $8, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_8_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $8, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $8, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_8_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $8, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $8, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_8_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_8_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_8_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 8(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_9):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_9_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $9, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $9, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_9_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $9, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $9, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_9_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $9, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $9, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_9_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_9_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_9_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 9(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_10_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $10, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $10, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_10_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $10, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $10, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_10_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $10, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $10, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_10_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_10_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_10_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 10(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_11_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $11, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $11, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_11_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $11, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $11, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_11_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $11, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $11, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_11_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_11_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_11_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 11(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# endif
+
+ ALIGN (4)
+L(shr_12):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_12_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $12, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $12, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_12_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $12, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $12, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_12_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $12, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $12, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_12_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_12_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_12_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 12(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+# ifndef USE_AS_WMEMCMP
+
+ ALIGN (4)
+L(shr_13):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_13_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $13, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $13, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_13_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $13, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $13, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_13_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $13, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $13, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_13_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_13_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_13_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 13(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_14):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_14_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $14, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $14, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_14_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $14, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $14, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_14_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $14, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $14, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_14_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_14_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_14_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 14(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_15):
+ cmp $80, %rcx
+ lea -48(%rcx), %rcx
+ mov %edx, %eax
+ jae L(shr_15_gobble)
+
+ movdqa 16(%rsi), %xmm1
+ movdqa %xmm1, %xmm2
+ palignr $15, (%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm1
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, %xmm2, %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+ add $15, %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+
+ ALIGN (4)
+L(shr_15_gobble):
+ sub $32, %rcx
+ movdqa 16(%rsi), %xmm0
+ palignr $15, (%rsi), %xmm0
+ pcmpeqb (%rdi), %xmm0
+
+ movdqa 32(%rsi), %xmm3
+ palignr $15, 16(%rsi), %xmm3
+ pcmpeqb 16(%rdi), %xmm3
+
+L(shr_15_gobble_loop):
+ pand %xmm0, %xmm3
+ sub $32, %rcx
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+
+ movdqa 64(%rsi), %xmm3
+ palignr $15, 48(%rsi), %xmm3
+ sbb $0xffff, %edx
+ movdqa 48(%rsi), %xmm0
+ palignr $15, 32(%rsi), %xmm0
+ pcmpeqb 32(%rdi), %xmm0
+ lea 32(%rsi), %rsi
+ pcmpeqb 48(%rdi), %xmm3
+
+ lea 32(%rdi), %rdi
+ jz L(shr_15_gobble_loop)
+ pand %xmm0, %xmm3
+
+ cmp $0, %rcx
+ jge L(shr_15_gobble_next)
+ inc %edx
+ add $32, %rcx
+L(shr_15_gobble_next):
+ test %edx, %edx
+ jnz L(exit)
+
+ pmovmskb %xmm3, %edx
+ movdqa %xmm0, %xmm1
+ lea 32(%rdi), %rdi
+ lea 32(%rsi), %rsi
+ sub $0xffff, %edx
+ jnz L(exit)
+
+ lea 15(%rsi), %rsi
+ add %rcx, %rsi
+ add %rcx, %rdi
+ jmp L(less48bytes)
+# endif
+ ALIGN (4)
+L(exit):
+ pmovmskb %xmm1, %r8d
+ sub $0xffff, %r8d
+ jz L(first16bytes)
+ lea -16(%rsi), %rsi
+ lea -16(%rdi), %rdi
+ mov %r8d, %edx
+L(first16bytes):
+ add %rax, %rsi
+L(less16bytes):
+# ifndef USE_AS_WMEMCMP
+ test %dl, %dl
+ jz L(next_24_bytes)
+
+ test $0x01, %dl
+ jnz L(Byte16)
+
+ test $0x02, %dl
+ jnz L(Byte17)
+
+ test $0x04, %dl
+ jnz L(Byte18)
+
+ test $0x08, %dl
+ jnz L(Byte19)
+
+ test $0x10, %dl
+ jnz L(Byte20)
+
+ test $0x20, %dl
+ jnz L(Byte21)
+
+ test $0x40, %dl
+ jnz L(Byte22)
+
+ movzbl -9(%rdi), %eax
+ movzbl -9(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte16):
+ movzbl -16(%rdi), %eax
+ movzbl -16(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte17):
+ movzbl -15(%rdi), %eax
+ movzbl -15(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte18):
+ movzbl -14(%rdi), %eax
+ movzbl -14(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte19):
+ movzbl -13(%rdi), %eax
+ movzbl -13(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte20):
+ movzbl -12(%rdi), %eax
+ movzbl -12(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte21):
+ movzbl -11(%rdi), %eax
+ movzbl -11(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(Byte22):
+ movzbl -10(%rdi), %eax
+ movzbl -10(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(next_24_bytes):
+ lea 8(%rdi), %rdi
+ lea 8(%rsi), %rsi
+ test $0x01, %dh
+ jnz L(Byte16)
+
+ test $0x02, %dh
+ jnz L(Byte17)
+
+ test $0x04, %dh
+ jnz L(Byte18)
+
+ test $0x08, %dh
+ jnz L(Byte19)
+
+ test $0x10, %dh
+ jnz L(Byte20)
+
+ test $0x20, %dh
+ jnz L(Byte21)
+
+ test $0x40, %dh
+ jnz L(Byte22)
+
+ mov -9(%rdi), %eax
+ and $0xff, %eax
+ mov -9(%rsi), %edx
+ and $0xff, %edx
+ sub %edx, %eax
+ ret
+# else
+/* special for wmemcmp */
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(second_double_word):
+ mov -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+ ret
+
+ ALIGN (4)
+L(fourth_double_word):
+ mov -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+ ret
+# endif
+
+ ALIGN (4)
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+ cmp $0, %ecx
+ je L(0bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $1, %ecx
+ je L(1bytes)
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+# else
+ jmp L(4bytes)
+# endif
+
+ ALIGN (4)
+L(more8bytes):
+ cmp $16, %ecx
+ jae L(more16bytes)
+ cmp $8, %ecx
+ je L(8bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $9, %ecx
+ je L(9bytes)
+ cmp $10, %ecx
+ je L(10bytes)
+ cmp $11, %ecx
+ je L(11bytes)
+ cmp $12, %ecx
+ je L(12bytes)
+ cmp $13, %ecx
+ je L(13bytes)
+ cmp $14, %ecx
+ je L(14bytes)
+ jmp L(15bytes)
+# else
+ jmp L(12bytes)
+# endif
+
+ ALIGN (4)
+L(more16bytes):
+ cmp $24, %ecx
+ jae L(more24bytes)
+ cmp $16, %ecx
+ je L(16bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $17, %ecx
+ je L(17bytes)
+ cmp $18, %ecx
+ je L(18bytes)
+ cmp $19, %ecx
+ je L(19bytes)
+ cmp $20, %ecx
+ je L(20bytes)
+ cmp $21, %ecx
+ je L(21bytes)
+ cmp $22, %ecx
+ je L(22bytes)
+ jmp L(23bytes)
+# else
+ jmp L(20bytes)
+# endif
+
+ ALIGN (4)
+L(more24bytes):
+ cmp $32, %ecx
+ jae L(more32bytes)
+ cmp $24, %ecx
+ je L(24bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $25, %ecx
+ je L(25bytes)
+ cmp $26, %ecx
+ je L(26bytes)
+ cmp $27, %ecx
+ je L(27bytes)
+ cmp $28, %ecx
+ je L(28bytes)
+ cmp $29, %ecx
+ je L(29bytes)
+ cmp $30, %ecx
+ je L(30bytes)
+ jmp L(31bytes)
+# else
+ jmp L(28bytes)
+# endif
+
+ ALIGN (4)
+L(more32bytes):
+ cmp $40, %ecx
+ jae L(more40bytes)
+ cmp $32, %ecx
+ je L(32bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $33, %ecx
+ je L(33bytes)
+ cmp $34, %ecx
+ je L(34bytes)
+ cmp $35, %ecx
+ je L(35bytes)
+ cmp $36, %ecx
+ je L(36bytes)
+ cmp $37, %ecx
+ je L(37bytes)
+ cmp $38, %ecx
+ je L(38bytes)
+ jmp L(39bytes)
+# else
+ jmp L(36bytes)
+# endif
+
+ ALIGN (4)
+L(more40bytes):
+ cmp $40, %ecx
+ je L(40bytes)
+# ifndef USE_AS_WMEMCMP
+ cmp $41, %ecx
+ je L(41bytes)
+ cmp $42, %ecx
+ je L(42bytes)
+ cmp $43, %ecx
+ je L(43bytes)
+ cmp $44, %ecx
+ je L(44bytes)
+ cmp $45, %ecx
+ je L(45bytes)
+ cmp $46, %ecx
+ je L(46bytes)
+ jmp L(47bytes)
+
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ movl -44(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ movl -40(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ movl -36(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ movl -32(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ movl -28(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ movl -24(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ movl -20(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ movl -16(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ movl -12(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ movl -8(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ movl -4(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# else
+ ALIGN (4)
+L(44bytes):
+ movl -44(%rdi), %eax
+ cmp -44(%rsi), %eax
+ jne L(find_diff)
+L(40bytes):
+ movl -40(%rdi), %eax
+ cmp -40(%rsi), %eax
+ jne L(find_diff)
+L(36bytes):
+ movl -36(%rdi), %eax
+ cmp -36(%rsi), %eax
+ jne L(find_diff)
+L(32bytes):
+ movl -32(%rdi), %eax
+ cmp -32(%rsi), %eax
+ jne L(find_diff)
+L(28bytes):
+ movl -28(%rdi), %eax
+ cmp -28(%rsi), %eax
+ jne L(find_diff)
+L(24bytes):
+ movl -24(%rdi), %eax
+ cmp -24(%rsi), %eax
+ jne L(find_diff)
+L(20bytes):
+ movl -20(%rdi), %eax
+ cmp -20(%rsi), %eax
+ jne L(find_diff)
+L(16bytes):
+ movl -16(%rdi), %eax
+ cmp -16(%rsi), %eax
+ jne L(find_diff)
+L(12bytes):
+ movl -12(%rdi), %eax
+ cmp -12(%rsi), %eax
+ jne L(find_diff)
+L(8bytes):
+ movl -8(%rdi), %eax
+ cmp -8(%rsi), %eax
+ jne L(find_diff)
+L(4bytes):
+ movl -4(%rdi), %eax
+ cmp -4(%rsi), %eax
+ jne L(find_diff)
+L(0bytes):
+ xor %eax, %eax
+ ret
+# endif
+
+# ifndef USE_AS_WMEMCMP
+ ALIGN (4)
+L(45bytes):
+ movl -45(%rdi), %eax
+ movl -45(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(41bytes):
+ movl -41(%rdi), %eax
+ movl -41(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(37bytes):
+ movl -37(%rdi), %eax
+ movl -37(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(33bytes):
+ movl -33(%rdi), %eax
+ movl -33(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(29bytes):
+ movl -29(%rdi), %eax
+ movl -29(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(25bytes):
+ movl -25(%rdi), %eax
+ movl -25(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(21bytes):
+ movl -21(%rdi), %eax
+ movl -21(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(17bytes):
+ movl -17(%rdi), %eax
+ movl -17(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(13bytes):
+ movl -13(%rdi), %eax
+ movl -13(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(9bytes):
+ movl -9(%rdi), %eax
+ movl -9(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(5bytes):
+ movl -5(%rdi), %eax
+ movl -5(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(46bytes):
+ movl -46(%rdi), %eax
+ movl -46(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(42bytes):
+ movl -42(%rdi), %eax
+ movl -42(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(38bytes):
+ movl -38(%rdi), %eax
+ movl -38(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(34bytes):
+ movl -34(%rdi), %eax
+ movl -34(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(30bytes):
+ movl -30(%rdi), %eax
+ movl -30(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(26bytes):
+ movl -26(%rdi), %eax
+ movl -26(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(22bytes):
+ movl -22(%rdi), %eax
+ movl -22(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(18bytes):
+ movl -18(%rdi), %eax
+ movl -18(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(14bytes):
+ movl -14(%rdi), %eax
+ movl -14(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(10bytes):
+ movl -10(%rdi), %eax
+ movl -10(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(6bytes):
+ movl -6(%rdi), %eax
+ movl -6(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(47bytes):
+ movl -47(%rdi), %eax
+ movl -47(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(43bytes):
+ movl -43(%rdi), %eax
+ movl -43(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(39bytes):
+ movl -39(%rdi), %eax
+ movl -39(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(35bytes):
+ movl -35(%rdi), %eax
+ movl -35(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(31bytes):
+ movl -31(%rdi), %eax
+ movl -31(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(27bytes):
+ movl -27(%rdi), %eax
+ movl -27(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(23bytes):
+ movl -23(%rdi), %eax
+ movl -23(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(19bytes):
+ movl -19(%rdi), %eax
+ movl -19(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%rdi), %eax
+ movl -15(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%rdi), %eax
+ movl -11(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%rdi), %eax
+ movl -7(%rsi), %ecx
+ cmp %ecx, %eax
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmpb %cl, %al
+ jne L(set)
+ cmp %ecx, %eax
+ jne L(set)
+ movzbl -1(%rdi), %eax
+ cmpb -1(%rsi), %al
+ jne L(set)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(find_diff):
+ cmpb %cl, %al
+ jne L(set)
+ cmpw %cx, %ax
+ jne L(set)
+ shr $16, %eax
+ shr $16, %ecx
+ cmpb %cl, %al
+ jne L(set)
+
+/* We get there only if we already know there is a
+difference. */
+
+ cmp %ecx, %eax
+L(set):
+ sbb %eax, %eax
+ sbb $-1, %eax
+ ret
+# else
+
+/* for wmemcmp */
+ ALIGN (4)
+L(find_diff):
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(find_diff_bigger):
+ ret
+# endif
+
+ ALIGN (4)
+L(equal):
+ xor %eax, %eax
+ ret
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 301ab28..8bf8f3a 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -1,5 +1,5 @@
/* Multiple versions of memcmp
- Copyright (C) 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,11 +29,20 @@ ENTRY(memcmp)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
-1: leaq __memcmp_sse2(%rip), %rax
- testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 2f
+
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __memcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
leaq __memcmp_sse4_1(%rip), %rax
-2: ret
+ ret
+
+3: leaq __memcmp_ssse3(%rip), %rax
+ ret
+
END(memcmp)
# undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
new file mode 100644
index 0000000..793f059
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define WMEMCMP __wmemcmp_sse2
+#endif
+
+#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
new file mode 100644
index 0000000..b07973a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_sse4_1
+
+#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
new file mode 100644
index 0000000..a41ef95
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_WMEMCMP 1
+#define MEMCMP __wmemcmp_ssse3
+
+#include "memcmp-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
similarity index 55%
copy from sysdeps/x86_64/multiarch/memcmp.S
copy to sysdeps/x86_64/multiarch/wmemcmp.S
index 301ab28..7c3b7ed 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -1,5 +1,5 @@
-/* Multiple versions of memcmp
- Copyright (C) 2010 Free Software Foundation, Inc.
+/* Multiple versions of wmemcmp
+ Copyright (C) 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -24,36 +24,24 @@
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
-ENTRY(memcmp)
- .type memcmp, @gnu_indirect_function
+ENTRY(wmemcmp)
+ .type wmemcmp, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
-1: leaq __memcmp_sse2(%rip), %rax
- testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 2f
- leaq __memcmp_sse4_1(%rip), %rax
-2: ret
-END(memcmp)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __memcmp_sse2, @function; \
- .p2align 4; \
- __memcmp_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
-
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
- they will be called without setting up EBX needed for PLT which is
- used by IFUNC. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
-#endif
-#include "../memcmp.S"
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __wmemcmp_sse2(%rip), %rax
+ ret
+
+2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 3f
+ leaq __wmemcmp_sse4_1(%rip), %rax
+ ret
+
+3: leaq __wmemcmp_ssse3(%rip), %rax
+ ret
+
+END(wmemcmp)
+#endif
diff --git a/wcsmbs/wmemcmp.c b/wcsmbs/wmemcmp.c
index c6a321b..e7edc87 100644
--- a/wcsmbs/wmemcmp.c
+++ b/wcsmbs/wmemcmp.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1996, 1997 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997i, 2011 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
@@ -19,9 +19,12 @@
#include <wchar.h>
+#ifndef WMEMCMP
+# define wmemcmp
+#endif
int
-wmemcmp (s1, s2, n)
+WMEMCMP (s1, s2, n)
const wchar_t *s1;
const wchar_t *s2;
size_t n;
@@ -34,19 +37,19 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[1];
c2 = (wint_t) s2[1];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[2];
c2 = (wint_t) s2[2];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
c1 = (wint_t) s1[3];
c2 = (wint_t) s2[3];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
s1 += 4;
s2 += 4;
n -= 4;
@@ -57,7 +60,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@@ -67,7 +70,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
++s1;
++s2;
--n;
@@ -77,7 +80,7 @@ wmemcmp (s1, s2, n)
c1 = (wint_t) s1[0];
c2 = (wint_t) s2[0];
if (c1 - c2 != 0)
- return c1 - c2;
+ return c1 > c2 ? 1 : -1;
}
return 0;
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 29 +
NEWS | 2 +-
string/test-memcmp.c | 47 +-
sysdeps/i386/i686/multiarch/Makefile | 3 +-
sysdeps/i386/i686/multiarch/memcmp-sse4.S | 396 +++++--
sysdeps/i386/i686/multiarch/memcmp-ssse3.S | 565 +++++---
sysdeps/i386/i686/multiarch/wmemcmp-c.c | 5 +
sysdeps/i386/i686/multiarch/wmemcmp-sse4.S | 4 +
sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S | 4 +
sysdeps/i386/i686/multiarch/wmemcmp.S | 59 +
sysdeps/x86_64/multiarch/Makefile | 3 +-
sysdeps/x86_64/multiarch/memcmp-sse4.S | 192 +++-
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1997 +++++++++++++++++++++++++++
sysdeps/x86_64/multiarch/memcmp.S | 19 +-
sysdeps/x86_64/multiarch/wmemcmp-c.c | 5 +
sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 +
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 +
sysdeps/x86_64/multiarch/wmemcmp.S | 47 +
wcsmbs/wmemcmp.c | 21 +-
19 files changed, 3070 insertions(+), 336 deletions(-)
create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-c.c
create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-sse4.S
create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S
create mode 100644 sysdeps/i386/i686/multiarch/wmemcmp.S
create mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp.S
hooks/post-receive
--
GNU C Library master sources