This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v2] faster strlen on x64
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 5 Nov 2012 15:35:14 +0100
- Subject: [PATCH v2] faster strlen on x64
Hello,
I revised the strlen implementation based on profiling results [1].
Since for strlen it suffices most of the time to inspect only the first
four 16-byte blocks, I optimized the header accordingly.
[1] http://kam.mff.cuni.cz/~ondra/benchmark_string/profile/result.html
There I used this version of strlen.
I decided not to use ifuncs as I do not know how to make this faster on Atom.
I could gain something for 48-64 byte strings by using unaligned loads.
It could be slower in the case when the first cache line is in cache but the
second is not and strlen does not have to use the second cache line.
I also made it usable in rtld: because the x64 calling convention
passes arguments only in xmm0-xmm7 and the xmm registers are call-clobbered,
xmm8-xmm15 are available.
As strnlen is quite similar, I unified their implementations.
It will be used in strcat, which I split into a separate patch.
2012-11-05 Ondrej Bilka <neleai@seznam.cz>
* sysdeps/x86_64/strlen.S: Added new implementation.
* sysdeps/x86_64/strnlen.S: Use sysdeps/x86_64/strlen.S.
* sysdeps/x86_64/rtld-strlen.S: Use sysdeps/x86_64/strlen.S.
* sysdeps/x86_64/multiarch/strlen.S: No longer needed.
* sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: No longer needed.
* sysdeps/x86_64/multiarch/strnlen.S: No longer needed.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Delete unused ifuncs.
* sysdeps/x86_64/multiarch/Makefile: Updated.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 -
sysdeps/x86_64/multiarch/strlen.S | 68 ------
sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 -
sysdeps/x86_64/multiarch/strnlen.S | 57 ------
sysdeps/x86_64/rtld-strlen.S | 139 +-------------
sysdeps/x86_64/strlen.S | 258 ++++++++++++++++--------
sysdeps/x86_64/strnlen.S | 69 +------
8 files changed, 185 insertions(+), 414 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strlen.S
delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..6b07afa 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
- strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
+ strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 332a60d..55896c3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -188,9 +188,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.S. */
- IFUNC_IMPL (i, name, strnlen,
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf)
- IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/strpbrk.S. */
IFUNC_IMPL (i, name, strpbrk,
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
deleted file mode 100644
index f93432e..0000000
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Multiple versions of strlen(str) -- determine the length of the string STR.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2012 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
- the DSO. In static binaries we need strlen before the initialization
- happened. */
-#if defined SHARED && !defined NOT_IN_libc
- .text
-ENTRY(strlen)
- .type strlen, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strlen_sse2_pminub(%rip), %rax
- testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
- jnz 2f
- leaq __strlen_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strlen_sse42(%rip), %rax
- ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strlen_sse2_no_bsf(%rip), %rax
-3: ret
-END(strlen)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strlen_sse2, @function; \
- .align 16; \
- .globl __strlen_sse2; \
- .hidden __strlen_sse2; \
- __strlen_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strlen calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strlen; __GI_strlen = __strlen_sse2
-#endif
-
-#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
deleted file mode 100644
index 248328d..0000000
--- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNLEN
-#define STRLEN __strnlen_sse2_no_bsf
-#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
deleted file mode 100644
index 4df05fc..0000000
--- a/sysdeps/x86_64/multiarch/strnlen.S
+++ /dev/null
@@ -1,57 +0,0 @@
-/* multiple version of strnlen
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2011-2012 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
-#ifndef NOT_IN_libc
-
- .text
-ENTRY(__strnlen)
- .type __strnlen, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strnlen_sse2(%rip), %rax
- testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 2f
- leaq __strnlen_sse2_no_bsf(%rip), %rax
-2: ret
-END(__strnlen)
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strnlen_sse2, @function; \
- .align 16; \
- .globl __strnlen_sse2; \
- .hidden __strnlen_sse2; \
- __strnlen_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
-
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
- .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
-#endif
-
-#include "../strnlen.S"
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
index 7293f87..71be00b 100644
--- a/sysdeps/x86_64/rtld-strlen.S
+++ b/sysdeps/x86_64/rtld-strlen.S
@@ -1,138 +1 @@
-/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2002, 2003 Free Software Foundation, Inc.
- Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-#include "bp-sym.h"
-#include "bp-asm.h"
-
-
- .text
-ENTRY (strlen)
- movq %rdi, %rcx /* Duplicate source pointer. */
- andl $7, %ecx /* mask alignment bits */
- movq %rdi, %rax /* duplicate destination. */
- jz 1f /* aligned => start loop */
-
- neg %ecx /* We need to align to 8 bytes. */
- addl $8,%ecx
- /* Search the first bytes directly. */
-0: cmpb $0x0,(%rax) /* is byte NUL? */
- je 2f /* yes => return */
- incq %rax /* increment pointer */
- decl %ecx
- jnz 0b
-
-1: movq $0xfefefefefefefeff,%r8 /* Save magic. */
-
- .p2align 4 /* Align loop. */
-4: /* Main Loop is unrolled 4 times. */
- /* First unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Second unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Third unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Fourth unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jz 4b /* no NUL found => continue loop */
-
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0x00ff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- testl $0xff000000, %ecx /* is fourth byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- shrq $32, %rcx /* look at other half. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0xff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-2:
- subq %rdi, %rax /* compute difference to string start */
- ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
+#include "sysdeps/x86_64/strlen.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f83d857..65bb849 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,7 +1,4 @@
-/* strlen(str) -- determine the length of the string STR.
- Copyright (C) 2009, 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
+/* strlen(str) -- determine the length of the string STR.
Copyright (C) 2012 Free Software Foundation, Inc.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -17,85 +14,184 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include <sysdep.h>
+/* This implementation must avoid xmm0-xmm7 to be also usable in dynamic linker. */
- .text
+.text
ENTRY(strlen)
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %esi
- sub %rax, %rcx
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- and %esi, %edx
- jnz L(exit)
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- .p2align 4
-L(align16_loop):
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
-
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- lea 64(%rax), %rax
- test %edx, %edx
- jz L(align16_loop)
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- ret
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 16(%rdx,%rax), %rax
- ret
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 32(%rdx,%rax), %rax
- ret
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 48(%rdx,%rax), %rax
- ret
+/* The string pointer s is read from %rdi and the length is left in %rax at every RETURN; under AS_STRNLEN, %rsi holds the limit n. */
+#ifdef AS_STRCAT
+#define RETURN jmp .cpy_str
+#elif defined(AS_STRNCAT)
+#define RETURN jmp .cpy_str
+ mov %rdx,%r9 /* copy %rdx to %r9: FIND_ZERO below overwrites %rdx */
+#else
+#define RETURN ret
+#endif
+
+#ifdef AS_STRNLEN
+ test %rsi,%rsi /* n == 0: return 0 without reading any memory */
+ jne L(zero_n) /* despite its name, L(zero_n) is the n != 0 path */
+ xor %rax,%rax
+ RETURN
+ L(zero_n):
+/* STRNLEN_PROLOG: expects %rcx = s - %rax and %r9 = -64; rounds %rax down to 64, makes %rsi the limit relative to that %rax, and sets bit (n + %rcx) of %rdx when it is below 64. */
+#define STRNLEN_PROLOG(lab)\
+ mov %rsi,%r8;\
+ andq $-64,%rax;\
+ addq %rdi,%rsi;\
+ subq %rax,%rsi;\
+ addq %rcx,%r8;\
+ testq %r9,%r8;\
+ jne L(lab) ;\
+ bts %r8,%rdx;\
+L(lab):
+#else
+#define STRNLEN_PROLOG(lab) andq $-64,%rax
+#endif
+/* NOTE(review): STRNLEN_PROLOG adds n to a pointer/offset with no overflow check -- confirm the behaviour for n close to SIZE_MAX. */
+/* FIND_ZERO: requires %xmm8-%xmm11 == 0; leaves in %rdx a 64-bit mask whose bit i is set iff byte i of the 64 bytes at (%rax) is zero. Overwrites %rcx, %r8, %r10 and %xmm8-%xmm11. */
+#define FIND_ZERO \
+ pcmpeqb (%rax),%xmm8 ;\
+ pcmpeqb 16(%rax),%xmm9;\
+ pmovmskb %xmm8, %r8d;\
+ pcmpeqb 32(%rax),%xmm10;\
+ pmovmskb %xmm9, %edx;\
+ pcmpeqb 48(%rax),%xmm11;\
+ salq $16, %rdx;\
+ pmovmskb %xmm10, %r10d;\
+ pmovmskb %xmm11, %ecx;\
+ salq $16, %rcx;\
+ orq %r8, %rdx;\
+ orq %r10, %rcx;\
+ salq $32, %rcx;\
+ orq %rcx, %rdx;
+
+/* PROLOG: scan the 64 bytes at %rax, then shift the mask right by (%rdi ^ %rax) so that bit 0 corresponds to the first byte of the string; ZF is set when the shifted mask is empty. Leaves %rax rounded down to 64. */
+#define PROLOG(lab) \
+ FIND_ZERO;\
+ movq %rdi, %rcx;\
+ xorq %rax,%rcx;\
+ STRNLEN_PROLOG(lab);\
+ sarq %cl,%rdx;\
+ test %rdx, %rdx;
+
+
+ /* When profiling gcc, 94% of calls are resolved by this header. */
+ /* It could be profitable to save space and share the tails of
+ strlen and strnlen. */
+ movq %rdi, %rax
+ pxor %xmm8,%xmm8 /* FIND_ZERO compares memory against zeroed %xmm8-%xmm11 */
+ movq %rdi, %r8
+ pxor %xmm9,%xmm9
+ /* Check whether the 64-byte read below could cross a page, i.e. (s & 4095) > 4032. This happens with probability at most 1/32. */
+ andq $4095, %r8 /* r8 = s & 4095 */
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+#ifdef AS_STRNLEN
+ movq $-64, %r9 /* r9 = ~63: used both as a "value >= 64" test mask and as the -64 step for %rsi */
+#endif
+ cmpq $4032, %r8
+#ifndef LINE_ALIGNED_START
+ /* We cannot unify this branch with L(next) as it is ~6 cycles slower. */
+ ja L(next)
+ andq $-16,%rax /* common path: scan 64 bytes starting at s rounded down to 16 */
+ PROLOG(fall1)
+ je L(loop_start) /* ZF from PROLOG: nothing found in the first 64 bytes */
+ bsfq %rdx, %rax /* index of the lowest set mask bit is the result */
+ RETURN
+L(next):
+#endif
+
+ andq $-64,%rax /* scan only the 64-byte-aligned block that contains s */
+ PROLOG(fall2)
+ pxor %xmm11,%xmm11 /* re-zero %xmm11 for the loop's pcmpeqb; pxor leaves the flags set by PROLOG untouched */
+ je L(loop_start)
+ bsfq %rdx, %rax
+ RETURN
+
+L(loop_end64):
+ addq $64, %rax /* the block that ended the loop starts at 64(%rax) */
+L(loop_end0):
+ pxor %xmm8,%xmm8 /* FIND_ZERO needs %xmm8-%xmm11 zeroed again */
+ pxor %xmm9,%xmm9
+ pxor %xmm10,%xmm10
+ pxor %xmm11,%xmm11
+#ifdef AS_STRNLEN
+ testq %rsi,%rsi /* %rsi == 0 here presumably means the limit coincides with %rax, so %rax - s is returned */
+ jne L(fall3)
+ subq %rdi,%rax
+ RETURN
+L(fall3):
+ FIND_ZERO
+ testq %r9,%rsi
+ jne L(fall4)
+ bts %rsi,%rdx /* limit lies inside these 64 bytes: mark it in the mask like a terminator */
+L(fall4):
+#else
+ FIND_ZERO
+#endif
+
+ bsfq %rdx, %rdx /* offset of the first marked byte within the block */
+ addq %rdx, %rax
+ subq %rdi, %rax /* result = (block address + offset) - s */
+ RETURN
+
+L(loop_start):
+ #ifdef AS_STRNLEN
+ addq %r9,%rsi /* %rsi -= 64 */
+ testq %r9,%rsi
+ je L(loop_end64) /* %rsi < 64: leave the loop and let L(loop_end64) apply the limit */
+ #endif
+ /* Prefetching helps when the data is in L2 cache or further away,
+ but I do not know how to prefetch without harming performance
+ on short strings. */
+ #prefetcht0 576(%rax)
+ /* It is possible to compute pminub 64(%rax), 80(%rax) and
+ pminub 96(%rax), 112(%rax) in parallel. However, no processor
+ up to Sandy Bridge exploits this, so this implementation uses a
+ single dependency chain, which saves one movdqa instruction.
+ */
+ movdqa 64(%rax), %xmm8 /* xmm8 = bytewise minimum of the four 16-byte blocks at 64(%rax) */
+ pminub 80(%rax), %xmm8
+ pminub 96(%rax), %xmm8
+ pminub 112(%rax), %xmm8
+ pcmpeqb %xmm11, %xmm8 /* %xmm11 is zero here: 0xff in each lane where some block had a zero byte */
+ pmovmskb %xmm8, %edx
+ testl %edx, %edx
+ jne L(loop_end64)
+ subq $-128, %rax /* rax += 128 (-128 fits a sign-extended 8-bit immediate, +128 does not) */
+ #ifdef AS_STRNLEN
+ addq %r9,%rsi
+ testq %r9,%rsi
+ je L(loop_end0)
+ #endif
+ #prefetcht0 512(%rax)
+ movdqa (%rax), %xmm8 /* same test for the 64 bytes at the new %rax */
+ pminub 16(%rax), %xmm8
+ pminub 32(%rax), %xmm8
+ pminub 48(%rax), %xmm8
+ pcmpeqb %xmm11, %xmm8
+ pmovmskb %xmm8, %edx
+ testl %edx, %edx
+ je L(loop_start)
+ jmp L(loop_end0)
END(strlen)
libc_hidden_builtin_def (strlen)
+weak_alias(strlen,__strlen_sse2)
+
+#ifdef INLINE_STRLEN
+ENTRY(__strlen_inline) /* Short probe: examines only the aligned 16-byte block that contains s (%rdi). */
+ mov %rdi,%rax
+ pxor %xmm8,%xmm8
+ andq $-16,%rax /* rax = s rounded down to a 16-byte boundary */
+ pcmpeqb (%rax),%xmm8 /* 0xff in every lane whose byte is zero */
+ pmovmskb %xmm8,%rdx /* one mask bit per byte of the block */
+ shl %rdi,%dl /* NOTE(review): not an encodable x86 shift (the count must be %cl or an immediate); presumably meant to discard the mask bits for bytes before s -- confirm */
+ test %edx,%edx
+ je L(strlen) /* NOTE(review): no L(strlen) label is defined in this patch; presumably a fallback to the full strlen -- confirm */
+ bsf %edx,%eax /* index of the lowest set mask bit */
+ ret
+END(__strlen_inline)
+#endif
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index 7b38bf4..dc393a4 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,63 +1,6 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
- Copyright (C) 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-
- .text
-ENTRY(__strnlen)
- movq %rsi, %rax
- testq %rsi, %rsi
- jz 3f
- pxor %xmm2, %xmm2
- movq %rdi, %rcx
- movq %rdi, %r8
- movq $16, %r9
- andq $~15, %rdi
- movdqa %xmm2, %xmm1
- pcmpeqb (%rdi), %xmm2
- orl $0xffffffff, %r10d
- subq %rdi, %rcx
- shll %cl, %r10d
- subq %rcx, %r9
- pmovmskb %xmm2, %edx
- andl %r10d, %edx
- jnz 1f
- subq %r9, %rsi
- jbe 3f
-
-2: movdqa 16(%rdi), %xmm0
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jnz 1f
- subq $16, %rsi
- jnbe 2b
-3: ret
-
-1: subq %r8, %rdi
- bsfl %edx, %edx
- addq %rdi, %rdx
- cmpq %rdx, %rax
- cmovnbq %rdx, %rax
- ret
-END(__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
+#define AS_STRNLEN /* selects the length-limited (%rsi) code paths in strlen.S */
+#define strlen __strnlen /* ENTRY/END(strlen) in strlen.S then define __strnlen */
+#define __strlen_sse2 __strnlen_sse2 /* same renaming for the weak alias emitted by strlen.S */
+#include "sysdeps/x86_64/strlen.S"
+weak_alias(__strnlen,strnlen);
+weak_alias(__strnlen,__GI_strnlen); /* NOTE(review): presumably stands in for the removed libc_hidden_def (strnlen) -- confirm */
--
1.7.4.4