This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.17-354-g87bd9bc
- From: neleai at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 6 Mar 2013 21:31:36 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.17-354-g87bd9bc
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master, has been updated
via 87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823 (commit)
from b79188d71716b6286866e06add976fe84100595e (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823
commit 87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823
Author: Ondrej Bilka <neleai@seznam.cz>
Date: Wed Mar 6 22:27:18 2013 +0100
Revert " * sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation"
This reverts commit b79188d71716b6286866e06add976fe84100595e.
diff --git a/ChangeLog b/ChangeLog
index 4cf29a0..c82ed84 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,26 +1,3 @@
-2013-03-06 Ondrej Bilka <neleai@seznam.cz>
-
- * sysdeps/x86_64/strlen.S: Replace with new SSE2 based
- implementation which is faster on all x86_64 architectures.
- Tested on AMD, Intel Nehalem, SNB, IVB.
- * sysdeps/x86_64/strnlen.S: Likewise.
-
- * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
- Remove all multiarch strlen and strnlen versions.
- * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
- Remove strlen and strnlen related parts.
-
- * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Update.
- Inline strlen part.
- * sysdeps/x86_64/multiarch/strcat-ssse3.S: Likewise.
-
- * sysdeps/x86_64/multiarch/strlen.S: Remove.
- * sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: Likewise.
- * sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: Likewise.
- * sysdeps/x86_64/multiarch/strlen-sse4.S: Likewise.
- * sysdeps/x86_64/multiarch/strnlen.S: Likewise.
- * sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: Likewise.
-
2013-03-06 Patsy Franklin <pfrankli@redhat.com>
* io/fcntl.h: Added a comment about AT_EACCESS and AT_REMOVEDIR.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 67686ad..dd6c27d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,12 +10,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 memset-x86-64 strcat-ssse3 strncat-ssse3\
+ strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
+ strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 848991e..643cb2d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -187,6 +187,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/strpbrk.S. */
IFUNC_IMPL (i, name, strpbrk,
IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
@@ -257,6 +262,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 6d9951e..72bb609 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -34,233 +34,10 @@ ENTRY (STRCAT)
mov %rdx, %r8
# endif
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- pmovmskb %xmm0, %edx
- and %r10d, %edx
- jnz L(exit)
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-sse2-pminub.S"
+# undef RETURN
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- pcmpeqb 80(%rax), %xmm0
- add $64, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit64)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 80(%rax), %xmm0
- add $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm1
- add $16, %rax
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm2
- add $16, %rax
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $0x3f, %rax
- jz L(align64_loop)
-
- pcmpeqb 16(%rax), %xmm3
- add $16, %rax
- pmovmskb %xmm3, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $16, %rax
- .p2align 4
- L(align64_loop):
- movaps (%rax), %xmm4
- pminub 16(%rax), %xmm4
- movaps 32(%rax), %xmm5
- pminub 48(%rax), %xmm5
- add $64, %rax
- pminub %xmm4, %xmm5
- pcmpeqb %xmm0, %xmm5
- pmovmskb %xmm5, %edx
- test %edx, %edx
- jz L(align64_loop)
-
- pcmpeqb -64(%rax), %xmm0
- sub $80, %rax
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $16, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $32, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $48, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit64):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $64, %rax
-
- .p2align 4
L(StartStrcpyPart):
lea (%r9, %rax), %rdi
mov %rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index 901e66f..fea9d11 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -33,317 +33,11 @@ ENTRY (STRCAT)
mov %rdx, %r8
# endif
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
+# define RETURN jmp L(StartStrcpyPart)
+# include "strlen-sse2-no-bsf.S"
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
+# undef RETURN
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
L(StartStrcpyPart):
mov %rsi, %rcx
lea (%rdi, %rax), %rdx
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
new file mode 100644
index 0000000..ff2ab70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
@@ -0,0 +1,685 @@
+/* strlen SSE2 without bsf
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* only for strlen case we don't use optimized version for STATIC build just for SHARED */
+
+#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc
+
+# ifndef USE_AS_STRCAT
+
+# include <sysdep.h>
+
+# define RETURN ret
+
+# ifndef STRLEN
+# define STRLEN __strlen_sse2_no_bsf
+# endif
+
+ atom_text_section
+ENTRY (STRLEN)
+# endif
+ xor %eax, %eax
+# ifdef USE_AS_STRNLEN
+ mov %rsi, %r8
+ sub $4, %rsi
+ jbe L(len_less4_prolog)
+# endif
+ cmpb $0, (%rdi)
+ jz L(exit_tail0)
+ cmpb $0, 1(%rdi)
+ jz L(exit_tail1)
+ cmpb $0, 2(%rdi)
+ jz L(exit_tail2)
+ cmpb $0, 3(%rdi)
+ jz L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %rsi
+ jbe L(len_less8_prolog)
+# endif
+
+ cmpb $0, 4(%rdi)
+ jz L(exit_tail4)
+ cmpb $0, 5(%rdi)
+ jz L(exit_tail5)
+ cmpb $0, 6(%rdi)
+ jz L(exit_tail6)
+ cmpb $0, 7(%rdi)
+ jz L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %rsi
+ jbe L(len_less12_prolog)
+# endif
+
+ cmpb $0, 8(%rdi)
+ jz L(exit_tail8)
+ cmpb $0, 9(%rdi)
+ jz L(exit_tail9)
+ cmpb $0, 10(%rdi)
+ jz L(exit_tail10)
+ cmpb $0, 11(%rdi)
+ jz L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+ sub $4, %rsi
+ jbe L(len_less16_prolog)
+# endif
+
+ cmpb $0, 12(%rdi)
+ jz L(exit_tail12)
+ cmpb $0, 13(%rdi)
+ jz L(exit_tail13)
+ cmpb $0, 14(%rdi)
+ jz L(exit_tail14)
+ cmpb $0, 15(%rdi)
+ jz L(exit_tail15)
+ pxor %xmm0, %xmm0
+ lea 16(%rdi), %rcx
+ lea 16(%rdi), %rax
+ and $-16, %rax
+
+# ifdef USE_AS_STRNLEN
+ and $15, %rdi
+ add %rdi, %rsi
+ sub $64, %rsi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %rsi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %rsi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ sub $64, %rsi
+ jbe L(len_less64)
+# endif
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+ pcmpeqb (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ lea 16(%rax), %rax
+ jnz L(exit)
+
+# ifdef USE_AS_STRNLEN
+ mov %rax, %rdx
+ and $63, %rdx
+ add %rdx, %rsi
+# endif
+
+ and $-0x40, %rax
+
+ .p2align 4
+L(aligned_64):
+# ifdef USE_AS_STRNLEN
+ sub $64, %rsi
+ jbe L(len_less64)
+# endif
+ pcmpeqb (%rax), %xmm0
+ pcmpeqb 16(%rax), %xmm1
+ pcmpeqb 32(%rax), %xmm2
+ pcmpeqb 48(%rax), %xmm3
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %r11d
+ pmovmskb %xmm2, %r10d
+ pmovmskb %xmm3, %r9d
+ or %edx, %r9d
+ or %r11d, %r9d
+ or %r10d, %r9d
+ lea 64(%rax), %rax
+ jz L(aligned_64)
+
+ test %edx, %edx
+ jnz L(aligned_64_exit_16)
+ test %r11d, %r11d
+ jnz L(aligned_64_exit_32)
+ test %r10d, %r10d
+ jnz L(aligned_64_exit_48)
+L(aligned_64_exit_64):
+ pmovmskb %xmm3, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_48):
+ lea -16(%rax), %rax
+ mov %r10d, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_32):
+ lea -32(%rax), %rax
+ mov %r11d, %edx
+ jmp L(aligned_64_exit)
+L(aligned_64_exit_16):
+ lea -48(%rax), %rax
+L(aligned_64_exit):
+L(exit):
+ sub %rcx, %rax
+ test %dl, %dl
+ jz L(exit_high)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+
+ test $0x02, %dl
+ jnz L(exit_tail1)
+
+ test $0x04, %dl
+ jnz L(exit_tail2)
+
+ test $0x08, %dl
+ jnz L(exit_tail3)
+
+ test $0x10, %dl
+ jnz L(exit_tail4)
+
+ test $0x20, %dl
+ jnz L(exit_tail5)
+
+ test $0x40, %dl
+ jnz L(exit_tail6)
+ add $7, %eax
+L(exit_tail0):
+ RETURN
+
+L(exit_high):
+ add $8, %eax
+ test $0x01, %dh
+ jnz L(exit_tail0)
+
+ test $0x02, %dh
+ jnz L(exit_tail1)
+
+ test $0x04, %dh
+ jnz L(exit_tail2)
+
+ test $0x08, %dh
+ jnz L(exit_tail3)
+
+ test $0x10, %dh
+ jnz L(exit_tail4)
+
+ test $0x20, %dh
+ jnz L(exit_tail5)
+
+ test $0x40, %dh
+ jnz L(exit_tail6)
+ add $7, %eax
+ RETURN
+
+# ifdef USE_AS_STRNLEN
+
+ .p2align 4
+L(len_less64):
+ pxor %xmm0, %xmm0
+ add $64, %rsi
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%rax), %rax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %rsi
+ jbe L(return_start_len)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%rax), %rax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %rsi
+ jbe L(return_start_len)
+
+ pcmpeqb (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%rax), %rax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %rsi
+ jbe L(return_start_len)
+
+ pcmpeqb (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%rax), %rax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ mov %r8, %rax
+ ret
+
+ .p2align 4
+L(strnlen_exit):
+ sub %rcx, %rax
+
+ test %dl, %dl
+ jz L(strnlen_exit_high)
+ mov %dl, %cl
+ and $15, %cl
+ jz L(strnlen_exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(strnlen_exit_tail1)
+ test $0x04, %dl
+ jnz L(strnlen_exit_tail2)
+ sub $4, %rsi
+ jb L(return_start_len)
+ lea 3(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_8):
+ test $0x10, %dl
+ jnz L(strnlen_exit_tail4)
+ test $0x20, %dl
+ jnz L(strnlen_exit_tail5)
+ test $0x40, %dl
+ jnz L(strnlen_exit_tail6)
+ sub $8, %rsi
+ jb L(return_start_len)
+ lea 7(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(strnlen_exit_high_8)
+ test $0x01, %dh
+ jnz L(strnlen_exit_tail8)
+ test $0x02, %dh
+ jnz L(strnlen_exit_tail9)
+ test $0x04, %dh
+ jnz L(strnlen_exit_tail10)
+ sub $12, %rsi
+ jb L(return_start_len)
+ lea 11(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_high_8):
+ test $0x10, %dh
+ jnz L(strnlen_exit_tail12)
+ test $0x20, %dh
+ jnz L(strnlen_exit_tail13)
+ test $0x40, %dh
+ jnz L(strnlen_exit_tail14)
+ sub $16, %rsi
+ jb L(return_start_len)
+ lea 15(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail1):
+ sub $2, %rsi
+ jb L(return_start_len)
+ lea 1(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail2):
+ sub $3, %rsi
+ jb L(return_start_len)
+ lea 2(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail4):
+ sub $5, %rsi
+ jb L(return_start_len)
+ lea 4(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail5):
+ sub $6, %rsi
+ jb L(return_start_len)
+ lea 5(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail6):
+ sub $7, %rsi
+ jb L(return_start_len)
+ lea 6(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail8):
+ sub $9, %rsi
+ jb L(return_start_len)
+ lea 8(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail9):
+ sub $10, %rsi
+ jb L(return_start_len)
+ lea 9(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail10):
+ sub $11, %rsi
+ jb L(return_start_len)
+ lea 10(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail12):
+ sub $13, %rsi
+ jb L(return_start_len)
+ lea 12(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail13):
+ sub $14, %rsi
+ jb L(return_start_len)
+ lea 13(%eax), %eax
+ ret
+
+ .p2align 4
+L(strnlen_exit_tail14):
+ sub $15, %rsi
+ jb L(return_start_len)
+ lea 14(%eax), %eax
+ ret
+
+ .p2align 4
+L(return_start_len):
+ mov %r8, %rax
+ ret
+
+/* for prolog only */
+
+ .p2align 4
+L(len_less4_prolog):
+ add $4, %rsi
+ jz L(exit_tail0)
+
+ cmpb $0, (%rdi)
+ jz L(exit_tail0)
+ cmp $1, %esi
+ je L(exit_tail1)
+
+ cmpb $0, 1(%rdi)
+ jz L(exit_tail1)
+ cmp $2, %esi
+ je L(exit_tail2)
+
+ cmpb $0, 2(%rdi)
+ jz L(exit_tail2)
+ cmp $3, %esi
+ je L(exit_tail3)
+
+ cmpb $0, 3(%rdi)
+ jz L(exit_tail3)
+ mov $4, %eax
+ ret
+
+ .p2align 4
+L(len_less8_prolog):
+ add $4, %rsi
+
+ cmpb $0, 4(%rdi)
+ jz L(exit_tail4)
+ cmp $1, %esi
+ je L(exit_tail5)
+
+ cmpb $0, 5(%rdi)
+ jz L(exit_tail5)
+ cmp $2, %esi
+ je L(exit_tail6)
+
+ cmpb $0, 6(%rdi)
+ jz L(exit_tail6)
+ cmp $3, %esi
+ je L(exit_tail7)
+
+ cmpb $0, 7(%rdi)
+ jz L(exit_tail7)
+ mov $8, %eax
+ ret
+
+ .p2align 4
+L(len_less12_prolog):
+ add $4, %rsi
+
+ cmpb $0, 8(%rdi)
+ jz L(exit_tail8)
+ cmp $1, %esi
+ je L(exit_tail9)
+
+ cmpb $0, 9(%rdi)
+ jz L(exit_tail9)
+ cmp $2, %esi
+ je L(exit_tail10)
+
+ cmpb $0, 10(%rdi)
+ jz L(exit_tail10)
+ cmp $3, %esi
+ je L(exit_tail11)
+
+ cmpb $0, 11(%rdi)
+ jz L(exit_tail11)
+ mov $12, %eax
+ ret
+
+ .p2align 4
+L(len_less16_prolog):
+ add $4, %rsi
+
+ cmpb $0, 12(%rdi)
+ jz L(exit_tail12)
+ cmp $1, %esi
+ je L(exit_tail13)
+
+ cmpb $0, 13(%rdi)
+ jz L(exit_tail13)
+ cmp $2, %esi
+ je L(exit_tail14)
+
+ cmpb $0, 14(%rdi)
+ jz L(exit_tail14)
+ cmp $3, %esi
+ je L(exit_tail15)
+
+ cmpb $0, 15(%rdi)
+ jz L(exit_tail15)
+ mov $16, %eax
+ ret
+# endif
+
+ .p2align 4
+L(exit_tail1):
+ add $1, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail2):
+ add $2, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail3):
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail4):
+ add $4, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail5):
+ add $5, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail6):
+ add $6, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail7):
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail8):
+ add $8, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail9):
+ add $9, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail10):
+ add $10, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail11):
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail12):
+ add $12, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail13):
+ add $13, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail14):
+ add $14, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail15):
+ add $15, %eax
+# ifndef USE_AS_STRCAT
+ RETURN
+END (STRLEN)
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
similarity index 88%
copy from sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
copy to sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
index 6d9951e..cc4bb57 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
@@ -1,4 +1,4 @@
-/* strcat with SSE2
+/* strlen SSE2
Copyright (C) 2011-2013 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -17,23 +17,18 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef NOT_IN_libc
+#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
-# ifndef STRCAT
-# define STRCAT __strcat_sse2_unaligned
-# endif
+# include <sysdep.h>
-# define USE_AS_STRCAT
+# define RETURN ret
-.text
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
+ .section .text.sse2,"ax",@progbits
+ENTRY (__strlen_sse2_pminub)
+# endif
xor %rax, %rax
mov %edi, %ecx
and $0x3f, %ecx
@@ -58,7 +53,6 @@ L(next):
pmovmskb %xmm0, %edx
and %r10d, %edx
jnz L(exit)
-
L(align16_start):
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
@@ -147,6 +141,7 @@ L(align16_start):
test %edx, %edx
jnz L(exit64)
+
test $0x3f, %rax
jz L(align64_loop)
@@ -197,6 +192,7 @@ L(align16_start):
test %edx, %edx
jz L(align64_loop)
+
pcmpeqb -64(%rax), %xmm0
sub $80, %rax
pmovmskb %xmm0, %edx
@@ -219,7 +215,7 @@ L(align16_start):
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
- jmp L(StartStrcpyPart)
+ RETURN
.p2align 4
L(exit):
@@ -227,50 +223,37 @@ L(exit):
L(exit_less16):
bsf %rdx, %rdx
add %rdx, %rax
- jmp L(StartStrcpyPart)
-
+ RETURN
.p2align 4
L(exit16):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $16, %rax
- jmp L(StartStrcpyPart)
-
+ RETURN
.p2align 4
L(exit32):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $32, %rax
- jmp L(StartStrcpyPart)
-
+ RETURN
.p2align 4
L(exit48):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $48, %rax
- jmp L(StartStrcpyPart)
-
+ RETURN
.p2align 4
L(exit64):
sub %rdi, %rax
bsf %rdx, %rdx
add %rdx, %rax
add $64, %rax
+# ifndef USE_AS_STRCAT
+ RETURN
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
+END (__strlen_sse2_pminub)
# endif
-
-# include "strcpy-sse2-unaligned.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000..8d685df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,84 @@
+/* strlen with SSE4
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
+ movq %rdi, %r8
+ andq $~15, %rdi
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
+ leaq (%rdi,%rcx), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
+ bsfl %edx, %eax
+ addq %rdi, %rax
+ ret
+
+END (__strlen_sse42)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
new file mode 100644
index 0000000..ab29cef
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -0,0 +1,68 @@
+/* Multiple versions of strlen(str) -- determine the length of the string STR.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc and for
+ the DSO. In static binaries we need strlen before the initialization
+ happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .text
+ENTRY(strlen) /* Resolver: returns in %rax the address of the strlen variant to use.  */
+ .type strlen, @gnu_indirect_function /* Mark the symbol as an indirect function (IFUNC).  */
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* Field at KIND_OFFSET nonzero: presumably already initialized -- confirm in init-arch.h.  */
+ jne 1f /* Nonzero: skip the initialization call.  */
+ call __init_cpu_features
+1: leaq __strlen_sse2_pminub(%rip), %rax /* Tentative choice: pminub variant.  */
+ testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip) /* Is the Prefer_PMINUB_for_stringop bit set?  */
+ jnz 2f /* Yes: keep pminub, subject to the Slow_BSF check at 2.  */
+ leaq __strlen_sse2(%rip), %rax /* Otherwise fall back to the plain SSE2 variant ...  */
+ testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip) /* ... unless the SSE4_2 bit is set.  */
+ jz 2f /* No SSE4.2: keep SSE2, subject to the Slow_BSF check at 2.  */
+ leaq __strlen_sse42(%rip), %rax /* SSE4.2 available: select the pcmpistri variant.  */
+ ret
+2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) /* Is the Slow_BSF bit set?  */
+ jz 3f /* No: return the variant already in %rax.  */
+ leaq __strlen_sse2_no_bsf(%rip), %rax /* Yes: override the choice with the variant that avoids bsf.  */
+3: ret
+END(strlen)
+
+# undef ENTRY /* Redefined below so the code that follows is emitted as __strlen_sse2, whatever NAME is passed.  */
+# define ENTRY(name) \
+ .type __strlen_sse2, @function; \
+ .align 16; \
+ .globl __strlen_sse2; \
+ .hidden __strlen_sse2; \
+ __strlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END /* Redefined below to close __strlen_sse2, ignoring NAME.  */
+# define END(name) \
+ cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
+# undef libc_hidden_builtin_def /* Redefined below to make __GI_strlen a global alias of __strlen_sse2.  */
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+ The speedup we get from using SSE4.2 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_sse2
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
new file mode 100644
index 0000000..248328d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2_no_bsf
+#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
new file mode 100644
index 0000000..124f845
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strnlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+
+ .text
+ENTRY(__strnlen) /* Resolver: returns in %rax the address of the strnlen variant to use.  */
+ .type __strnlen, @gnu_indirect_function /* Mark the symbol as an indirect function (IFUNC).  */
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip) /* Field at KIND_OFFSET nonzero: presumably already initialized -- confirm in init-arch.h.  */
+ jne 1f /* Nonzero: skip the initialization call.  */
+ call __init_cpu_features
+1: leaq __strnlen_sse2(%rip), %rax /* Default choice: SSE2 variant.  */
+ testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip) /* Is the Slow_BSF bit set?  */
+ jz 2f /* No: return the default.  */
+ leaq __strnlen_sse2_no_bsf(%rip), %rax /* Yes: select the variant that avoids bsf.  */
+2: ret
+END(__strnlen)
+
+# undef ENTRY /* Redefined below so the code that follows is emitted as __strnlen_sse2, whatever NAME is passed.  */
+# define ENTRY(name) \
+ .type __strnlen_sse2, @function; \
+ .align 16; \
+ .globl __strnlen_sse2; \
+ .hidden __strnlen_sse2; \
+ __strnlen_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END /* Redefined below to close __strnlen_sse2, ignoring NAME.  */
+# define END(name) \
+ cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
+
+# undef libc_hidden_def /* Redefined below to make __GI_strnlen a global alias of __strnlen_sse2.  */
+# define libc_hidden_def(name) \
+ .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e82fe8d..4bdca0a 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,6 @@
-/* SSE2 version of strlen.
- Copyright (C) 2012, 2013 Free Software Foundation, Inc.
+/* strlen(str) -- determine the length of the string STR.
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -18,217 +19,83 @@
#include <sysdep.h>
-/* Long lived register are
- strlen(s), strnlen(s, n):
- %xmm11 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
+ .text
ENTRY(strlen)
-
-#define FIND_ZERO \
- pcmpeqb (%rax), %xmm8; \
- pcmpeqb 16(%rax), %xmm9; \
- pcmpeqb 32(%rax), %xmm10; \
- pcmpeqb 48(%rax), %xmm11; \
- pmovmskb %xmm8, %esi; \
- pmovmskb %xmm9, %edx; \
- pmovmskb %xmm10, %r8d; \
- pmovmskb %xmm11, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %rsi, %rsi
- jne L(n_nonzero)
xor %rax, %rax
- ret
-L(n_nonzero):
-
-/* Initialize long lived registers. */
-
- add %rdi, %rsi
- mov %rsi, %r10
- and $-64, %r10
- mov %rsi, %r11
-#endif
-
- pxor %xmm8, %xmm8
- pxor %xmm9, %xmm9
- pxor %xmm10, %xmm10
- pxor %xmm11, %xmm11
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
ja L(next)
-
-#ifdef AS_STRNLEN
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- movdqu (%rax), %xmm12
- pcmpeqb %xmm8, %xmm12
- pmovmskb %xmm12, %edx
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
test %edx, %edx
- je L(next48_bytes)
- bsfq %rdx, %rax
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- pcmpeqb 16(%rax), %xmm9;
- pcmpeqb 32(%rax), %xmm10;
- pcmpeqb 48(%rax), %xmm11;
- pmovmskb %xmm9, %edx;
- pmovmskb %xmm10, %r8d;
- pmovmskb %xmm11, %ecx;
- salq $16, %rdx;
- salq $16, %rcx;
- orq %r8, %rcx;
- salq $32, %rcx;
- orq %rcx, %rdx;
-#endif
-
- PROLOG(loop)
-
- .p2align 4
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
L(next):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm9, %xmm9
- pxor %xmm10, %xmm10
- pxor %xmm11, %xmm11
-#ifdef AS_STRNLEN
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %esi
+ sub %rax, %rcx
+ shl %cl, %esi
+ pmovmskb %xmm0, %edx
+ and %esi, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
.p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
+L(align16_loop):
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
- movdqa (%rax), %xmm8
- pminub 16(%rax), %xmm8
- pminub 32(%rax), %xmm8
- pminub 48(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first)
- pxor %xmm8, %xmm8
- FIND_ZERO
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 64(%rax), %rax
+ test %edx, %edx
+ jz L(align16_loop)
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
ret
-
.p2align 4
-L(exit):
- pxor %xmm8, %xmm8
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 16(%rdx,%rax), %rax
ret
-
-#else
.p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm8
- pminub 80(%rax), %xmm8
- pminub 96(%rax), %xmm8
- pminub 112(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm8
- pminub 16(%rax), %xmm8
- pminub 32(%rax), %xmm8
- pminub 48(%rax), %xmm8
- pcmpeqb %xmm11, %xmm8
- pmovmskb %xmm8, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 32(%rdx,%rax), %rax
+ ret
.p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm8, %xmm8
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ lea 48(%rdx,%rax), %rax
ret
-
-#endif
-
END(strlen)
-#ifndef AS_STRLEN
libc_hidden_builtin_def (strlen)
-#endif
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index d3c43ac..6e53503 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,6 +1,63 @@
-#define AS_STRNLEN
-#define strlen __strnlen
-#include "strlen.S"
+/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
-weak_alias (__strnlen, strnlen);
-libc_hidden_builtin_def (strnlen)
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+ .text
+ENTRY(__strnlen) /* In: %rdi = string address, %rsi = maxlen.  Out: %rax = min (offset of first NUL, maxlen).  */
+ movq %rsi, %rax /* Default result: maxlen.  */
+ testq %rsi, %rsi
+ jz 3f /* maxlen == 0: return 0 without reading memory.  */
+ pxor %xmm2, %xmm2 /* %xmm2 = 0.  */
+ movq %rdi, %rcx
+ movq %rdi, %r8 /* %r8 keeps the original address.  */
+ movq $16, %r9
+ andq $~15, %rdi /* Round %rdi down to a 16-byte boundary.  */
+ movdqa %xmm2, %xmm1 /* %xmm1 = 0, kept as the NUL pattern for the loop.  */
+ pcmpeqb (%rdi), %xmm2 /* 0xff in every byte lane of the first block that holds NUL.  */
+ orl $0xffffffff, %r10d /* %r10d = all ones.  */
+ subq %rdi, %rcx /* %rcx = address - aligned base (0..15).  */
+ shll %cl, %r10d /* Mask with the low bits cleared for bytes preceding the string.  */
+ subq %rcx, %r9 /* %r9 = 16 - offset = string bytes covered by the first block.  */
+ pmovmskb %xmm2, %edx /* %edx = one mask bit per byte lane.  */
+ andl %r10d, %edx /* Drop NUL hits that precede the string.  */
+ jnz 1f /* NUL found in the first block.  */
+ subq %r9, %rsi /* %rsi = maxlen - bytes already checked.  */
+ jbe 3f /* maxlen reached (unsigned <=): return maxlen.  */
+
+2: movdqa 16(%rdi), %xmm0 /* Load the next aligned 16-byte block.  */
+ leaq 16(%rdi), %rdi /* Advance the base without touching flags.  */
+ pcmpeqb %xmm1, %xmm0 /* Compare against zero.  */
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jnz 1f /* NUL found in this block.  */
+ subq $16, %rsi /* 16 more bytes consumed from the budget.  */
+ jnbe 2b /* Loop while more than 16 bytes remained before the subtraction (unsigned >).  */
+3: ret /* %rax still holds maxlen here.  */
+
+1: subq %r8, %rdi /* %rdi = block base - string start.  */
+ bsfl %edx, %edx /* %edx = index of the first NUL inside the block.  */
+ addq %rdi, %rdx /* %rdx = offset of the NUL from the string start.  */
+ cmpq %rdx, %rax
+ cmovnbq %rdx, %rax /* If maxlen >= offset (unsigned), return the offset; otherwise keep maxlen.  */
+ ret
+END(__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 23 -
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 13 +
sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 229 +-------
sysdeps/x86_64/multiarch/strcat-ssse3.S | 312 +----------
sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S | 685 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 259 ++++++++
sysdeps/x86_64/multiarch/strlen-sse4.S | 84 +++
sysdeps/x86_64/multiarch/strlen.S | 68 +++
sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 +
sysdeps/x86_64/multiarch/strnlen.S | 57 ++
sysdeps/x86_64/strlen.S | 263 ++-------
sysdeps/x86_64/strnlen.S | 67 ++-
13 files changed, 1306 insertions(+), 763 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
create mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
create mode 100644 sysdeps/x86_64/multiarch/strlen.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen.S
hooks/post-receive
--
GNU C Library master sources