This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.



GNU C Library master sources branch master updated. glibc-2.17-354-g87bd9bc


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823 (commit)
      from  b79188d71716b6286866e06add976fe84100595e (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so they are listed in full
below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823

commit 87bd9bc4bd2a49a441bb9ba744c9ddb0c9434823
Author: Ondrej Bilka <neleai@seznam.cz>
Date:   Wed Mar 6 22:27:18 2013 +0100

    Revert "	* sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation"
    
    This reverts commit b79188d71716b6286866e06add976fe84100595e.

diff --git a/ChangeLog b/ChangeLog
index 4cf29a0..c82ed84 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,26 +1,3 @@
-2013-03-06  Ondrej Bilka  <neleai@seznam.cz>
-
-	* sysdeps/x86_64/strlen.S: Replace with new SSE2 based
-	implementation which is faster on all x86_64 architectures.
-	Tested on AMD, Intel Nehalem, SNB, IVB.
-	* sysdeps/x86_64/strnlen.S: Likewise.
-
-	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
-	Remove all multiarch strlen and strnlen versions.
-	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
-	Remove strlen and strnlen related parts.
-
-	* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Update.
-	Inline strlen part.
-	* sysdeps/x86_64/multiarch/strcat-ssse3.S: Likewise.
-
-	* sysdeps/x86_64/multiarch/strlen.S: Remove.
-	* sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: Likewise.
-	* sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: Likewise.
-	* sysdeps/x86_64/multiarch/strlen-sse4.S: Likewise.
-	* sysdeps/x86_64/multiarch/strnlen.S: Likewise.
-	* sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: Likewise.
-
 2013-03-06  Patsy Franklin  <pfrankli@redhat.com>
 
 	* io/fcntl.h: Added a comment about AT_EACCESS and AT_REMOVEDIR.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 67686ad..dd6c27d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,12 +10,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
-		   strncase_l-ssse3 memset-x86-64 strcat-ssse3 strncat-ssse3\
+		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
+		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
+		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 848991e..643cb2d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -187,6 +187,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
   /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
 	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
@@ -257,6 +262,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 6d9951e..72bb609 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -34,233 +34,10 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
 
-	xor	%rax, %rax
-	mov	%edi, %ecx
-	and	$0x3f, %ecx
-	pxor	%xmm0, %xmm0
-	cmp	$0x30, %ecx
-	ja	L(next)
-	movdqu	(%rdi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_less16)
-	mov	%rdi, %rax
-	and	$-16, %rax
-	jmp	L(align16_start)
-L(next):
-	mov	%rdi, %rax
-	and	$-16, %rax
-	pcmpeqb	(%rax), %xmm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	pmovmskb %xmm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2-pminub.S"
+# undef RETURN
 
-L(align16_start):
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	pcmpeqb	16(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$80, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm1
-	add	$16, %rax
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm2
-	add	$16, %rax
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm3
-	add	$16, %rax
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$16, %rax
-	.p2align 4
-	L(align64_loop):
-	movaps	(%rax),	%xmm4
-	pminub	16(%rax),	%xmm4
-	movaps	32(%rax),	%xmm5
-	pminub	48(%rax),	%xmm5
-	add	$64,	%rax
-	pminub	%xmm4,	%xmm5
-	pcmpeqb	%xmm0,	%xmm5
-	pmovmskb %xmm5,	%edx
-	test	%edx,	%edx
-	jz	L(align64_loop)
-
-	pcmpeqb	-64(%rax), %xmm0
-	sub	$80,	%rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$64, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_less16):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit16):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$16, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit32):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$32, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit48):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$48, %rax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit64):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$64, %rax
-
-	.p2align 4
 L(StartStrcpyPart):
 	lea	(%r9, %rax), %rdi
 	mov	%rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index 901e66f..fea9d11 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -33,317 +33,11 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
 
-	xor	%eax, %eax
-	cmpb	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmpb	$0, 1(%rdi)
-	jz	L(exit_tail1)
-	cmpb	$0, 2(%rdi)
-	jz	L(exit_tail2)
-	cmpb	$0, 3(%rdi)
-	jz	L(exit_tail3)
-
-	cmpb	$0, 4(%rdi)
-	jz	L(exit_tail4)
-	cmpb	$0, 5(%rdi)
-	jz	L(exit_tail5)
-	cmpb	$0, 6(%rdi)
-	jz	L(exit_tail6)
-	cmpb	$0, 7(%rdi)
-	jz	L(exit_tail7)
-
-	cmpb	$0, 8(%rdi)
-	jz	L(exit_tail8)
-	cmpb	$0, 9(%rdi)
-	jz	L(exit_tail9)
-	cmpb	$0, 10(%rdi)
-	jz	L(exit_tail10)
-	cmpb	$0, 11(%rdi)
-	jz	L(exit_tail11)
-
-	cmpb	$0, 12(%rdi)
-	jz	L(exit_tail12)
-	cmpb	$0, 13(%rdi)
-	jz	L(exit_tail13)
-	cmpb	$0, 14(%rdi)
-	jz	L(exit_tail14)
-	cmpb	$0, 15(%rdi)
-	jz	L(exit_tail15)
-	pxor	%xmm0, %xmm0
-	lea	16(%rdi), %rcx
-	lea	16(%rdi), %rax
-	and	$-16, %rax
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	pxor	%xmm2, %xmm2
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	pxor	%xmm3, %xmm3
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	and	$-0x40, %rax
+# define RETURN  jmp L(StartStrcpyPart)
+# include "strlen-sse2-no-bsf.S"
 
-	.p2align 4
-L(aligned_64):
-	pcmpeqb	(%rax), %xmm0
-	pcmpeqb	16(%rax), %xmm1
-	pcmpeqb	32(%rax), %xmm2
-	pcmpeqb	48(%rax), %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %r11d
-	pmovmskb %xmm2, %r10d
-	pmovmskb %xmm3, %r9d
-	or	%edx, %r9d
-	or	%r11d, %r9d
-	or	%r10d, %r9d
-	lea	64(%rax), %rax
-	jz	L(aligned_64)
-
-	test	%edx, %edx
-	jnz	L(aligned_64_exit_16)
-	test	%r11d, %r11d
-	jnz	L(aligned_64_exit_32)
-	test	%r10d, %r10d
-	jnz	L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
-	pmovmskb %xmm3, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_48):
-	lea	-16(%rax), %rax
-	mov	%r10d, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_32):
-	lea	-32(%rax), %rax
-	mov	%r11d, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_16):
-	lea	-48(%rax), %rax
-
-L(exit):
-	sub	%rcx, %rax
-	test	%dl, %dl
-	jz	L(exit_high)
-	test	$0x01, %dl
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dl
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dl
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dl
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dl
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dl
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dl
-	jnz	L(exit_tail6)
-	add	$7, %eax
-L(exit_tail0):
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_high):
-	add	$8, %eax
-	test	$0x01, %dh
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dh
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dh
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dh
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dh
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dh
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dh
-	jnz	L(exit_tail6)
-	add	$7, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail1):
-	add	$1, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail2):
-	add	$2, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail3):
-	add	$3, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail4):
-	add	$4, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail5):
-	add	$5, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail6):
-	add	$6, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail7):
-	add	$7, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail8):
-	add	$8, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail9):
-	add	$9, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail10):
-	add	$10, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail11):
-	add	$11, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail12):
-	add	$12, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail13):
-	add	$13, %eax
-	jmp	L(StartStrcpyPart)
+# undef RETURN
 
-	.p2align 4
-L(exit_tail14):
-	add	$14, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail15):
-	add	$15, %eax
-
-	.p2align 4
 L(StartStrcpyPart):
 	mov	%rsi, %rcx
 	lea	(%rdi, %rax), %rdx
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
new file mode 100644
index 0000000..ff2ab70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
@@ -0,0 +1,685 @@
+/* strlen SSE2 without bsf
+   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* In the plain strlen case the optimized version is built only for SHARED,
+   not for STATIC builds; the strcat and strnlen cases always use it.  */
+
+#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc
+
+# ifndef USE_AS_STRCAT
+
+#  include <sysdep.h>
+
+#  define RETURN	ret
+
+#  ifndef STRLEN
+#   define STRLEN	__strlen_sse2_no_bsf
+#  endif
+
+	atom_text_section
+ENTRY (STRLEN)
+# endif
+	xor	%eax, %eax
+#  ifdef USE_AS_STRNLEN
+	mov	%rsi, %r8
+	sub	$4, %rsi
+	jbe	L(len_less4_prolog)
+#  endif
+	cmpb	$0, (%rdi)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%rdi)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%rdi)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%rdi)
+	jz	L(exit_tail3)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %rsi
+	jbe	L(len_less8_prolog)
+# endif
+
+	cmpb	$0, 4(%rdi)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%rdi)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%rdi)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%rdi)
+	jz	L(exit_tail7)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %rsi
+	jbe	L(len_less12_prolog)
+# endif
+
+	cmpb	$0, 8(%rdi)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%rdi)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%rdi)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%rdi)
+	jz	L(exit_tail11)
+
+# ifdef USE_AS_STRNLEN
+	sub	$4, %rsi
+	jbe	L(len_less16_prolog)
+# endif
+
+	cmpb	$0, 12(%rdi)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%rdi)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%rdi)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%rdi)
+	jz	L(exit_tail15)
+	pxor	%xmm0, %xmm0
+	lea	16(%rdi), %rcx
+	lea	16(%rdi), %rax
+	and	$-16, %rax
+
+# ifdef USE_AS_STRNLEN
+	and	$15, %rdi
+	add	%rdi, %rsi
+	sub	$64, %rsi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %rsi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %rsi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	sub	$64, %rsi
+	jbe	L(len_less64)
+# endif
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+# ifdef USE_AS_STRNLEN
+	mov	%rax, %rdx
+	and	$63, %rdx
+	add	%rdx, %rsi
+# endif
+
+	and	$-0x40, %rax
+
+	.p2align 4
+L(aligned_64):
+# ifdef USE_AS_STRNLEN
+	sub	$64, %rsi
+	jbe	L(len_less64)
+# endif
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %r11d
+	pmovmskb %xmm2, %r10d
+	pmovmskb %xmm3, %r9d
+	or	%edx, %r9d
+	or	%r11d, %r9d
+	or	%r10d, %r9d
+	lea	64(%rax), %rax
+	jz	L(aligned_64)
+
+	test	%edx, %edx
+	jnz	L(aligned_64_exit_16)
+	test	%r11d, %r11d
+	jnz	L(aligned_64_exit_32)
+	test	%r10d, %r10d
+	jnz	L(aligned_64_exit_48)
+L(aligned_64_exit_64):
+	pmovmskb %xmm3, %edx
+	jmp	L(aligned_64_exit)
+L(aligned_64_exit_48):
+	lea	-16(%rax), %rax
+	mov	%r10d, %edx
+	jmp	L(aligned_64_exit)
+L(aligned_64_exit_32):
+	lea	-32(%rax), %rax
+	mov	%r11d, %edx
+	jmp	L(aligned_64_exit)
+L(aligned_64_exit_16):
+	lea	-48(%rax), %rax
+L(aligned_64_exit):
+L(exit):
+	sub	%rcx, %rax
+	test	%dl, %dl
+	jz	L(exit_high)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dl
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+L(exit_tail0):
+	RETURN
+
+L(exit_high):
+	add	$8, %eax
+	test	$0x01, %dh
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dh
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dh
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dh
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dh
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dh
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dh
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	RETURN
+
+# ifdef USE_AS_STRNLEN
+
+	.p2align 4
+L(len_less64):
+	pxor	%xmm0, %xmm0
+	add	$64, %rsi
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	lea	16(%rax), %rax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %rsi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%rax), %rax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %rsi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%rax), %rax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	sub	$16, %rsi
+	jbe	L(return_start_len)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	lea	16(%rax), %rax
+	test	%edx, %edx
+	jnz	L(strnlen_exit)
+
+	mov	%r8, %rax
+	ret
+
+	.p2align 4
+L(strnlen_exit):
+	sub	%rcx, %rax
+
+	test	%dl, %dl
+	jz	L(strnlen_exit_high)
+	mov	%dl, %cl
+	and	$15, %cl
+	jz	L(strnlen_exit_8)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+	test	$0x02, %dl
+	jnz	L(strnlen_exit_tail1)
+	test	$0x04, %dl
+	jnz	L(strnlen_exit_tail2)
+	sub	$4, %rsi
+	jb	L(return_start_len)
+	lea	3(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_8):
+	test	$0x10, %dl
+	jnz	L(strnlen_exit_tail4)
+	test	$0x20, %dl
+	jnz	L(strnlen_exit_tail5)
+	test	$0x40, %dl
+	jnz	L(strnlen_exit_tail6)
+	sub	$8, %rsi
+	jb	L(return_start_len)
+	lea	7(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_high):
+	mov	%dh, %ch
+	and	$15, %ch
+	jz	L(strnlen_exit_high_8)
+	test	$0x01, %dh
+	jnz	L(strnlen_exit_tail8)
+	test	$0x02, %dh
+	jnz	L(strnlen_exit_tail9)
+	test	$0x04, %dh
+	jnz	L(strnlen_exit_tail10)
+	sub	$12, %rsi
+	jb	L(return_start_len)
+	lea	11(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_high_8):
+	test	$0x10, %dh
+	jnz	L(strnlen_exit_tail12)
+	test	$0x20, %dh
+	jnz	L(strnlen_exit_tail13)
+	test	$0x40, %dh
+	jnz	L(strnlen_exit_tail14)
+	sub	$16, %rsi
+	jb	L(return_start_len)
+	lea	15(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail1):
+	sub	$2, %rsi
+	jb	L(return_start_len)
+	lea	1(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail2):
+	sub	$3, %rsi
+	jb	L(return_start_len)
+	lea	2(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail4):
+	sub	$5, %rsi
+	jb	L(return_start_len)
+	lea	4(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail5):
+	sub	$6, %rsi
+	jb	L(return_start_len)
+	lea	5(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail6):
+	sub	$7, %rsi
+	jb	L(return_start_len)
+	lea	6(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail8):
+	sub	$9, %rsi
+	jb	L(return_start_len)
+	lea	8(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail9):
+	sub	$10, %rsi
+	jb	L(return_start_len)
+	lea	9(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail10):
+	sub	$11, %rsi
+	jb	L(return_start_len)
+	lea	10(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail12):
+	sub	$13, %rsi
+	jb	L(return_start_len)
+	lea	12(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail13):
+	sub	$14, %rsi
+	jb	L(return_start_len)
+	lea	13(%eax), %eax
+	ret
+
+	.p2align 4
+L(strnlen_exit_tail14):
+	sub	$15, %rsi
+	jb	L(return_start_len)
+	lea	14(%eax), %eax
+	ret
+
+	.p2align 4
+L(return_start_len):
+	mov	%r8, %rax
+	ret
+
+/* for prolog only */
+
+	.p2align 4
+L(len_less4_prolog):
+	add	$4, %rsi
+	jz	L(exit_tail0)
+
+	cmpb	$0, (%rdi)
+	jz	L(exit_tail0)
+	cmp	$1, %esi
+	je	L(exit_tail1)
+
+	cmpb	$0, 1(%rdi)
+	jz	L(exit_tail1)
+	cmp	$2, %esi
+	je	L(exit_tail2)
+
+	cmpb	$0, 2(%rdi)
+	jz	L(exit_tail2)
+	cmp	$3, %esi
+	je	L(exit_tail3)
+
+	cmpb	$0, 3(%rdi)
+	jz	L(exit_tail3)
+	mov	$4, %eax
+	ret
+
+	.p2align 4
+L(len_less8_prolog):
+	add	$4, %rsi
+
+	cmpb	$0, 4(%rdi)
+	jz	L(exit_tail4)
+	cmp	$1, %esi
+	je	L(exit_tail5)
+
+	cmpb	$0, 5(%rdi)
+	jz	L(exit_tail5)
+	cmp	$2, %esi
+	je	L(exit_tail6)
+
+	cmpb	$0, 6(%rdi)
+	jz	L(exit_tail6)
+	cmp	$3, %esi
+	je	L(exit_tail7)
+
+	cmpb	$0, 7(%rdi)
+	jz	L(exit_tail7)
+	mov	$8, %eax
+	ret
+
+	.p2align 4
+L(len_less12_prolog):
+	add	$4, %rsi
+
+	cmpb	$0, 8(%rdi)
+	jz	L(exit_tail8)
+	cmp	$1, %esi
+	je	L(exit_tail9)
+
+	cmpb	$0, 9(%rdi)
+	jz	L(exit_tail9)
+	cmp	$2, %esi
+	je	L(exit_tail10)
+
+	cmpb	$0, 10(%rdi)
+	jz	L(exit_tail10)
+	cmp	$3, %esi
+	je	L(exit_tail11)
+
+	cmpb	$0, 11(%rdi)
+	jz	L(exit_tail11)
+	mov	$12, %eax
+	ret
+
+	.p2align 4
+L(len_less16_prolog):
+	add	$4, %rsi
+
+	cmpb	$0, 12(%rdi)
+	jz	L(exit_tail12)
+	cmp	$1, %esi
+	je	L(exit_tail13)
+
+	cmpb	$0, 13(%rdi)
+	jz	L(exit_tail13)
+	cmp	$2, %esi
+	je	L(exit_tail14)
+
+	cmpb	$0, 14(%rdi)
+	jz	L(exit_tail14)
+	cmp	$3, %esi
+	je	L(exit_tail15)
+
+	cmpb	$0, 15(%rdi)
+	jz	L(exit_tail15)
+	mov	$16, %eax
+	ret
+# endif
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail2):
+	add	$2, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail3):
+	add	$3, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail4):
+	add	$4, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail5):
+	add	$5, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail6):
+	add	$6, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail7):
+	add	$7, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail8):
+	add	$8, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail9):
+	add	$9, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail10):
+	add	$10, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail11):
+	add	$11, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail12):
+	add	$12, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail13):
+	add	$13, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail14):
+	add	$14, %eax
+	RETURN
+
+	.p2align 4
+L(exit_tail15):
+	add	$15, %eax
+# ifndef USE_AS_STRCAT
+	RETURN
+END (STRLEN)
+# endif
+#endif
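
A note on the idiom used throughout strlen-sse2-no-bsf.S above: the core of
every block is pcmpeqb against a zeroed register followed by pmovmskb, so the
first NUL byte shows up as the lowest set bit of a 16-bit mask.  A minimal C
sketch of that idiom with SSE2 intrinsics follows; it is an illustration with
a hypothetical name, not glibc code, and it assumes a 16-byte-aligned,
NUL-terminated input, ignoring the unaligned prolog and the strnlen bound
handled above:

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stddef.h>

size_t
strlen_sse2_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  size_t off = 0;
  for (;;)
    {
      /* pcmpeqb: 0xff in every byte lane that is NUL;
         pmovmskb: collect one bit per lane into an integer mask.  */
      __m128i chunk = _mm_load_si128 ((const __m128i *) (s + off));
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
      if (mask != 0)
        return off + (size_t) __builtin_ctz (mask);  /* index of first NUL */
      off += 16;
    }
}
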
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
similarity index 88%
copy from sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
copy to sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
index 6d9951e..cc4bb57 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
@@ -1,4 +1,4 @@
-/* strcat with SSE2
+/* strlen SSE2
    Copyright (C) 2011-2013 Free Software Foundation, Inc.
    Contributed by Intel Corporation.
    This file is part of the GNU C Library.
@@ -17,23 +17,18 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
+#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
 
-# include <sysdep.h>
+# ifndef USE_AS_STRCAT
 
-# ifndef STRCAT
-#  define STRCAT  __strcat_sse2_unaligned
-# endif
+#  include <sysdep.h>
 
-# define USE_AS_STRCAT
+#  define RETURN ret
 
-.text
-ENTRY (STRCAT)
-	mov	%rdi, %r9
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
+	.section .text.sse2,"ax",@progbits
+ENTRY (__strlen_sse2_pminub)
 
+# endif
 	xor	%rax, %rax
 	mov	%edi, %ecx
 	and	$0x3f, %ecx
@@ -58,7 +53,6 @@ L(next):
 	pmovmskb %xmm0, %edx
 	and	%r10d, %edx
 	jnz	L(exit)
-
 L(align16_start):
 	pxor	%xmm0, %xmm0
 	pxor	%xmm1, %xmm1
@@ -147,6 +141,7 @@ L(align16_start):
 	test	%edx, %edx
 	jnz	L(exit64)
 
+
 	test	$0x3f, %rax
 	jz	L(align64_loop)
 
@@ -197,6 +192,7 @@ L(align16_start):
 	test	%edx,	%edx
 	jz	L(align64_loop)
 
+
 	pcmpeqb	-64(%rax), %xmm0
 	sub	$80,	%rax
 	pmovmskb %xmm0, %edx
@@ -219,7 +215,7 @@ L(align16_start):
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
 	add	$64, %rax
-	jmp	L(StartStrcpyPart)
+	RETURN
 
 	.p2align 4
 L(exit):
@@ -227,50 +223,37 @@ L(exit):
 L(exit_less16):
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
-	jmp	L(StartStrcpyPart)
-
+	RETURN
 	.p2align 4
 L(exit16):
 	sub	%rdi, %rax
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
 	add	$16, %rax
-	jmp	L(StartStrcpyPart)
-
+	RETURN
 	.p2align 4
 L(exit32):
 	sub	%rdi, %rax
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
 	add	$32, %rax
-	jmp	L(StartStrcpyPart)
-
+	RETURN
 	.p2align 4
 L(exit48):
 	sub	%rdi, %rax
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
 	add	$48, %rax
-	jmp	L(StartStrcpyPart)
-
+	RETURN
 	.p2align 4
 L(exit64):
 	sub	%rdi, %rax
 	bsf	%rdx, %rdx
 	add	%rdx, %rax
 	add	$64, %rax
+# ifndef USE_AS_STRCAT
+	RETURN
 
-	.p2align 4
-L(StartStrcpyPart):
-	lea	(%r9, %rax), %rdi
-	mov	%rsi, %rcx
-	mov	%r9, %rax      /* save result */
-
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(ExitZero)
-#  define USE_AS_STRNCPY
+END (__strlen_sse2_pminub)
 # endif
-
-# include "strcpy-sse2-unaligned.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000..8d685df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,84 @@
+/* strlen with SSE4
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+	pxor	%xmm1, %xmm1
+	movl	%edi, %ecx
+	movq	%rdi, %r8
+	andq	$~15, %rdi
+	xor	%edi, %ecx
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	pxor	%xmm1, %xmm1
+
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
+	leaq	(%rdi,%rcx), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
+	bsfl	%edx, %eax
+	addq	%rdi, %rax
+	ret
+
+END (__strlen_sse42)
+
+#endif
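
A note on the pcmpistri idiom in strlen-sse4.S above: with immediate 0x08
("equal each" on unsigned bytes) the instruction sets ZF as soon as the
16-byte chunk contains a NUL and leaves its index in %ecx.  A rough
intrinsics equivalent of that inner comparison follows; it uses a
hypothetical name and, unlike the real routine, does not align the pointer
first, so its 16-byte loads could in principle cross into an unmapped page:

#include <nmmintrin.h>  /* SSE4.2 intrinsics */
#include <stddef.h>

size_t
strlen_sse42_sketch (const char *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  size_t off = 0;
  for (;;)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) (s + off));
      /* 0x08 = unsigned bytes, "equal each"; ZF is set when chunk has a NUL.  */
      if (_mm_cmpistrz (zero, chunk, 0x08))
        return off + (size_t) _mm_cmpistri (zero, chunk, 0x08);
      off += 16;
    }
}
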
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
new file mode 100644
index 0000000..ab29cef
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -0,0 +1,68 @@
+/* Multiple versions of strlen(str) -- determine the length of the string STR.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need strlen before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strlen_sse2_pminub(%rip), %rax
+	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
+	jnz	2f
+	leaq	__strlen_sse2(%rip), %rax
+	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	jz	2f
+	leaq	__strlen_sse42(%rip), %rax
+	ret
+2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	jz	3f
+	leaq    __strlen_sse2_no_bsf(%rip), %rax
+3:	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_sse2, @function; \
+	.align 16; \
+	.globl __strlen_sse2; \
+	.hidden __strlen_sse2; \
+	__strlen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+   The speedup we get from using SSE4.2 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_sse2
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
new file mode 100644
index 0000000..248328d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNLEN
+#define STRLEN __strnlen_sse2_no_bsf
+#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
new file mode 100644
index 0000000..124f845
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.S
@@ -0,0 +1,57 @@
+/* Multiple versions of strnlen.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strnlen_sse2(%rip), %rax
+	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	jz	2f
+	leaq	__strnlen_sse2_no_bsf(%rip), %rax
+2:	ret
+END(__strnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strnlen_sse2, @function; \
+	.align 16; \
+	.globl __strnlen_sse2; \
+	.hidden __strnlen_sse2; \
+	__strnlen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
+
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e82fe8d..4bdca0a 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,6 @@
-/* SSE2 version of strlen.
-   Copyright (C) 2012, 2013 Free Software Foundation, Inc.
+/* strlen(str) -- determine the length of the string STR.
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -18,217 +19,83 @@
 
 #include <sysdep.h>
 
-/* Long lived register are
-	strlen(s), strnlen(s, n):
 
-	%xmm11 - zero
-	%rdi   - s
-	%r10  (s+n)	& (~(64-1))
-	%r11   s+n
-*/
-
-
-.text
+	.text
 ENTRY(strlen)
-
-#define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
-	test	%rsi, %rsi
-	jne	L(n_nonzero)
 	xor	%rax, %rax
-	ret
-L(n_nonzero):
-
-/* Initialize long lived registers.  */
-
-	add	%rdi, %rsi
-	mov	%rsi, %r10
-	and	$-64, %r10
-	mov	%rsi, %r11
-#endif
-
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
 	ja	L(next)
-
-#ifdef AS_STRNLEN
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG  andq $-64, %rax;
-#endif
-
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
-	ret
-
-#ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
 	test	%edx, %edx
-	je 	L(next48_bytes)
-	bsfq	%rdx, %rax
-	ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9;
-	pcmpeqb 32(%rax), %xmm10;
-	pcmpeqb 48(%rax), %xmm11;
-	pmovmskb	%xmm9, %edx;
-	pmovmskb	%xmm10, %r8d;
-	pmovmskb	%xmm11, %ecx;
-	salq	$16, %rdx;
-	salq	$16, %rcx;
-	orq	%r8, %rcx;
-	salq	$32, %rcx;
-	orq	%rcx, %rdx;
-#endif
-
-	PROLOG(loop)
-
-	.p2align 4
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
 L(next):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
-	sarq	%cl, %rdx
-	test	%rdx, %rdx
-	je	L(loop_init)
-	bsfq	%rdx, %rax
-	ret
-#endif
-	.p2align 4
-L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
-#ifdef AS_STRNLEN
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %esi
+	sub	%rax, %rcx
+	shl	%cl, %esi
+	pmovmskb %xmm0, %edx
+	and	%esi, %edx
+	jnz	L(exit)
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	.p2align 4
-L(loop):
-
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
+L(align16_loop):
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
-	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
 
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first)
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
 
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea	64(%rax), %rax
+	test	%edx, %edx
+	jz	L(align16_loop)
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
 	ret
-
 	.p2align 4
-L(exit):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	lea	16(%rdx,%rax), %rax
 	ret
-
-#else
 	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
-	testl	%edx, %edx
-	jne	L(exit64)
-
-	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
-	testl	%edx, %edx
-	jne	L(exit0)
-	jmp	L(loop)
-
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	lea	32(%rdx,%rax), %rax
+	ret
 	.p2align 4
-L(exit64):
-	addq	$64, %rax
-L(exit0):
-	pxor	%xmm8, %xmm8
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	lea	48(%rdx,%rax), %rax
 	ret
-
-#endif
-
 END(strlen)
-#ifndef AS_STRLEN
 libc_hidden_builtin_def (strlen)
-#endif
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index d3c43ac..6e53503 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,6 +1,63 @@
-#define AS_STRNLEN
-#define strlen __strnlen
-#include "strlen.S"
+/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
+   Copyright (C) 2010-2013 Free Software Foundation, Inc.
+   Contributed by Ulrich Drepper <drepper@redhat.com>.
+   This file is part of the GNU C Library.
 
-weak_alias (__strnlen, strnlen);
-libc_hidden_builtin_def (strnlen)
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+	.text
+ENTRY(__strnlen)
+	movq	%rsi, %rax
+	testq	%rsi, %rsi
+	jz	3f
+	pxor	%xmm2, %xmm2
+	movq	%rdi, %rcx
+	movq	%rdi, %r8
+	movq	$16, %r9
+	andq	$~15, %rdi
+	movdqa	%xmm2, %xmm1
+	pcmpeqb	(%rdi), %xmm2
+	orl	$0xffffffff, %r10d
+	subq	%rdi, %rcx
+	shll	%cl, %r10d
+	subq	%rcx, %r9
+	pmovmskb %xmm2, %edx
+	andl	%r10d, %edx
+	jnz	1f
+	subq	%r9, %rsi
+	jbe	3f
+
+2:	movdqa	16(%rdi), %xmm0
+	leaq	16(%rdi), %rdi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	testl	%edx, %edx
+	jnz	1f
+	subq	$16, %rsi
+	jnbe	2b
+3:	ret
+
+1:	subq	%r8, %rdi
+	bsfl	%edx, %edx
+	addq	%rdi, %rdx
+	cmpq	%rdx, %rax
+	cmovnbq	%rdx, %rax
+	ret
+END(__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
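
The restored sysdeps/x86_64/strnlen.S above implements the usual strnlen
contract: examine at most maxlen bytes, stop early at the first NUL, and
clamp the result with cmov at the end.  A byte-wise C model of that contract,
as a reading aid for the 16-bytes-at-a-time assembly (illustrative only):

#include <stddef.h>

size_t
strnlen_model (const char *s, size_t maxlen)
{
  size_t i;
  /* Return the index of the first NUL, or maxlen if none occurs
     within the first maxlen bytes.  */
  for (i = 0; i < maxlen; i++)
    if (s[i] == '\0')
      break;
  return i;
}
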

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                        |   23 -
 sysdeps/x86_64/multiarch/Makefile                |    6 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c       |   13 +
 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S |  229 +-------
 sysdeps/x86_64/multiarch/strcat-ssse3.S          |  312 +----------
 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S    |  685 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S    |  259 ++++++++
 sysdeps/x86_64/multiarch/strlen-sse4.S           |   84 +++
 sysdeps/x86_64/multiarch/strlen.S                |   68 +++
 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S   |    3 +
 sysdeps/x86_64/multiarch/strnlen.S               |   57 ++
 sysdeps/x86_64/strlen.S                          |  263 ++-------
 sysdeps/x86_64/strnlen.S                         |   67 ++-
 13 files changed, 1306 insertions(+), 763 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen.S


hooks/post-receive
-- 
GNU C Library master sources

