This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 1/2] Fix strrchr regression.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 5 Aug 2013 19:33:46 +0200
- Subject: [PATCH 1/2] Fix strrchr regression.
Hi,
I intentionally held this patch back to see whether anybody else
reads the benchmark results.
I have known for two years that using SSE4.2 for strrchr causes a performance
regression on all architectures that support it. I also checked that the
regression is visible in the benchmark output on Nehalem, but nobody noticed.
Also, nobody noticed that the results do not make a lot of sense. It is quite
weird that a byte-by-byte implementation is slower when it checks 32 bytes
than when it checks 2048 bytes, isn't it?
simple_strrchr __strrchr_sse42 __strrchr_sse2_no_bsf __strrchr_sse2
Length 32, alignment in bytes 0: 3929 353 320 341
Length 32, alignment in bytes 1: 3932 353 293 341
Length 64, alignment in bytes 0: 3923 359 299 341
Length 64, alignment in bytes 2: 3929 359 293 341
Length 128, alignment in bytes 0: 3929 359 296 341
Length 128, alignment in bytes 3: 3917 356 287 338
Length 256, alignment in bytes 0: 3929 405 308 341
Length 256, alignment in bytes 4: 3932 365 296 344
Length 512, alignment in bytes 0: 3929 378 305 341
Length 512, alignment in bytes 5: 3929 399 308 344
Length 1024, alignment in bytes 0: 3932 420 305 344
Length 1024, alignment in bytes 6: 3757 396 302 362
Length 2048, alignment in bytes 0: 3633 362 320 338
Length 2048, alignment in bytes 7: 3633 356 314 338
To get reliable results I added strrchr to my profiler. You can clearly
see the asymptotic behavior of strrchr here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html
Actually there are two regressions here. The first one is that __strrchr_sse42
is also about 14% asymptotically slower on Sandy Bridge/Ivy Bridge and around
3% slower on fx10.
The second is more serious: on all architectures that I tested,
__strrchr_sse2_no_bsf came out asymptotically faster than __strrchr_sse2 and
consequently faster than __strrchr_sse42.
Given these regressions, it is viable to remove __strrchr_sse42 and __strrchr_sse2
and to use __strrchr_sse2_no_bsf as the base implementation.
We can accept this by the same reasoning that led us to choose
__strrchr_sse2 over __strrchr_sse42 for the Silvermont architecture.
I will send the second part, which optimizes the strrchr implementation, later.
---
sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 ------------------------
sysdeps/x86_64/multiarch/strrchr.S | 288 ------------
sysdeps/x86_64/strrchr.S | 579 ++++++++++++++++++++++---
3 files changed, 527 insertions(+), 895 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index fcef610..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
- Copyright (C) 2011-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
- atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
- movd %rsi, %xmm1
- pxor %xmm2, %xmm2
- mov %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
- /* ECX has OFFSET. */
- and $63, %rcx
- cmp $48, %rcx
- pshufd $0, %xmm1, %xmm1
- ja L(crosscache)
-
-/* unaligned string. */
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %rcx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
-/* Hancle unaligned string. */
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm3, %rdx
- /* Check if there is a match. */
- pmovmskb %xmm0, %rax
- /* Remove the leading bytes. */
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm0, %xmm2
- add $16, %rdi
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jz L(loop)
-
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
- jmp L(match_exit)
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test %cl, %cl
- jz L(find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(find_zero_8)
- test $0x01, %cl
- jnz L(FindZeroExit1)
- test $0x02, %cl
- jnz L(FindZeroExit2)
- test $0x04, %cl
- jnz L(FindZeroExit3)
- and $1 << 4 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_8):
- test $0x10, %cl
- jnz L(FindZeroExit5)
- test $0x20, %cl
- jnz L(FindZeroExit6)
- test $0x40, %cl
- jnz L(FindZeroExit7)
- and $1 << 8 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(find_zero_high_8)
- test $0x01, %ch
- jnz L(FindZeroExit9)
- test $0x02, %ch
- jnz L(FindZeroExit10)
- test $0x04, %ch
- jnz L(FindZeroExit11)
- and $1 << 12 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(find_zero_high_8):
- test $0x10, %ch
- jnz L(FindZeroExit13)
- test $0x20, %ch
- jnz L(FindZeroExit14)
- test $0x40, %ch
- jnz L(FindZeroExit15)
- and $1 << 16 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit1):
- and $1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit2):
- and $1 << 2 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit3):
- and $1 << 3 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit5):
- and $1 << 5 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit6):
- and $1 << 6 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit7):
- and $1 << 7 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit9):
- and $1 << 9 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit10):
- and $1 << 10 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit11):
- and $1 << 11 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit13):
- and $1 << 13 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit14):
- and $1 << 14 - 1, %rax
- jz L(return_value)
- jmp L(match_exit)
-
- .p2align 4
-L(FindZeroExit15):
- and $1 << 15 - 1, %rax
- jz L(return_value)
-
- .p2align 4
-L(match_exit):
- test %ah, %ah
- jnz L(match_exit_high)
- mov %al, %dl
- and $15 << 4, %dl
- jnz L(match_exit_8)
- test $0x08, %al
- jnz L(Exit4)
- test $0x04, %al
- jnz L(Exit3)
- test $0x02, %al
- jnz L(Exit2)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_8):
- test $0x80, %al
- jnz L(Exit8)
- test $0x40, %al
- jnz L(Exit7)
- test $0x20, %al
- jnz L(Exit6)
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high):
- mov %ah, %dh
- and $15 << 4, %dh
- jnz L(match_exit_high_8)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x02, %ah
- jnz L(Exit10)
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_exit_high_8):
- test $0x80, %ah
- jnz L(Exit16)
- test $0x40, %ah
- jnz L(Exit15)
- test $0x20, %ah
- jnz L(Exit14)
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit2):
- lea -15(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit3):
- lea -14(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit4):
- lea -13(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit6):
- lea -11(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit7):
- lea -10(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit8):
- lea -9(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit10):
- lea -7(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit11):
- lea -6(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit12):
- lea -5(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit14):
- lea -3(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit15):
- lea -2(%rdi), %rax
- ret
-
- .p2align 4
-L(Exit16):
- lea -1(%rdi), %rax
- ret
-
-/* Return NULL. */
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test %cl, %cl
- jz L(prolog_find_zero_high)
- mov %cl, %dl
- and $15, %dl
- jz L(prolog_find_zero_8)
- test $0x01, %cl
- jnz L(PrologFindZeroExit1)
- test $0x02, %cl
- jnz L(PrologFindZeroExit2)
- test $0x04, %cl
- jnz L(PrologFindZeroExit3)
- and $1 << 4 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_8):
- test $0x10, %cl
- jnz L(PrologFindZeroExit5)
- test $0x20, %cl
- jnz L(PrologFindZeroExit6)
- test $0x40, %cl
- jnz L(PrologFindZeroExit7)
- and $1 << 8 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high):
- mov %ch, %dh
- and $15, %dh
- jz L(prolog_find_zero_high_8)
- test $0x01, %ch
- jnz L(PrologFindZeroExit9)
- test $0x02, %ch
- jnz L(PrologFindZeroExit10)
- test $0x04, %ch
- jnz L(PrologFindZeroExit11)
- and $1 << 12 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_high_8):
- test $0x10, %ch
- jnz L(PrologFindZeroExit13)
- test $0x20, %ch
- jnz L(PrologFindZeroExit14)
- test $0x40, %ch
- jnz L(PrologFindZeroExit15)
- and $1 << 16 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit1):
- and $1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit2):
- and $1 << 2 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit3):
- and $1 << 3 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit5):
- and $1 << 5 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit6):
- and $1 << 6 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit7):
- and $1 << 7 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit9):
- and $1 << 9 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit10):
- and $1 << 10 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit11):
- and $1 << 11 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit13):
- and $1 << 13 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit14):
- and $1 << 14 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
- .p2align 4
-L(PrologFindZeroExit15):
- and $1 << 15 - 1, %rax
- jnz L(match_exit)
- xor %rax, %rax
- ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index 3f92a41..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* Multiple versions of strrchr
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
- the DSO. In static binaries we need strrchr before the initialization
- happened. */
-#if defined SHARED && !defined NOT_IN_libc
- .text
-ENTRY(strrchr)
- .type strrchr, @gnu_indirect_function
- cmpl $0, __cpu_features+KIND_OFFSET(%rip)
- jne 1f
- call __init_cpu_features
-1: leaq __strrchr_sse2(%rip), %rax
- testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strrchr_sse42(%rip), %rax
- ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strrchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strrchr)
-
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the last occurrence of the character c in the
- string s:
-
- char *strrchr (const char *s, int c);
-
- We use 0x4a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_MOST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- last offset. There are 4 cases:
-
- 1. The first 16byte data element has EOS and has the byte C at the
- last offset X.
- 2. The first 16byte data element is valid and has the byte C at the
- last offset X.
- 3. The first 16byte data element has EOS and doesn't have the byte C.
- 4. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 1 0
- 2 X 1 0 0
- 3 16 0 1 0
- 4 16 0 0 0
-
- We exit from the loop for cases 1 and 3 with jz which branches
- when ZFlag is 1. If CFlag == 1, ECX has the offset X for case 1. */
-
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strrchr_sse42, @function
- .globl __strrchr_sse42
- .hidden __strrchr_sse42
-__strrchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- xor %eax,%eax /* RAX has the last occurrence of s. */
- movd %esi, %xmm1
- punpcklbw %xmm1, %xmm1
- movl %edi, %esi
- punpcklbw %xmm1, %xmm1
- andl $15, %esi
- pshufd $0, %xmm1, %xmm1
- movq %rdi, %r8
- je L(loop)
-
-/* Handle unaligned string using psrldq. */
- leaq L(psrldq_table)(%rip), %rdx
- andq $-16, %r8
- movslq (%rdx,%rsi,4),%r9
- movdqa (%r8), %xmm0
- addq %rdx, %r9
- jmp *%r9
-
-/* Handle unaligned string with offset 1 using psrldq. */
- .p2align 4
-L(psrldq_1):
- psrldq $1, %xmm0
-
- .p2align 4
-L(unaligned_pcmpistri):
- pcmpistri $0x4a, %xmm1, %xmm0
- jnc L(unaligned_no_byte)
- leaq (%rdi,%rcx), %rax
-L(unaligned_no_byte):
- /* Find the length of the unaligned string. */
- pcmpistri $0x3a, %xmm0, %xmm0
- movl $16, %edx
- subl %esi, %edx
- cmpl %ecx, %edx
- /* Return RAX if the unaligned fragment to next 16B already
- contain the NULL terminator. */
- jg L(exit)
- addq $16, %r8
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- pcmpistri $0x4a, (%r8), %xmm1
- jbe L(match_or_eos)
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(match_or_eos):
- je L(had_eos)
-L(match_no_eos):
- leaq (%r8,%rcx), %rax
- addq $16, %r8
- jmp L(loop)
- .p2align 4
-L(had_eos):
- jnc L(exit)
- leaq (%r8,%rcx), %rax
- .p2align 4
-L(exit):
- ret
-
-/* Handle unaligned string with offset 15 using psrldq. */
- .p2align 4
-L(psrldq_15):
- psrldq $15, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq. */
- .p2align 4
-L(psrldq_14):
- psrldq $14, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq. */
- .p2align 4
-L(psrldq_13):
- psrldq $13, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq. */
- .p2align 4
-L(psrldq_12):
- psrldq $12, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq. */
- .p2align 4
-L(psrldq_11):
- psrldq $11, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq. */
- .p2align 4
-L(psrldq_10):
- psrldq $10, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq. */
- .p2align 4
-L(psrldq_9):
- psrldq $9, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq. */
- .p2align 4
-L(psrldq_8):
- psrldq $8, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq. */
- .p2align 4
-L(psrldq_7):
- psrldq $7, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq. */
- .p2align 4
-L(psrldq_6):
- psrldq $6, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq. */
- .p2align 4
-L(psrldq_5):
- psrldq $5, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq. */
- .p2align 4
-L(psrldq_4):
- psrldq $4, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq. */
- .p2align 4
-L(psrldq_3):
- psrldq $3, %xmm0
- jmp L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq. */
- .p2align 4
-L(psrldq_2):
- psrldq $2, %xmm0
- jmp L(unaligned_pcmpistri)
-
- cfi_endproc
- .size __strrchr_sse42, .-__strrchr_sse42
-
- .section .rodata.sse4.2,"a",@progbits
- .p2align 4
-L(psrldq_table):
- .int L(loop) - L(psrldq_table)
- .int L(psrldq_1) - L(psrldq_table)
- .int L(psrldq_2) - L(psrldq_table)
- .int L(psrldq_3) - L(psrldq_table)
- .int L(psrldq_4) - L(psrldq_table)
- .int L(psrldq_5) - L(psrldq_table)
- .int L(psrldq_6) - L(psrldq_table)
- .int L(psrldq_7) - L(psrldq_table)
- .int L(psrldq_8) - L(psrldq_table)
- .int L(psrldq_9) - L(psrldq_table)
- .int L(psrldq_10) - L(psrldq_table)
- .int L(psrldq_11) - L(psrldq_table)
- .int L(psrldq_12) - L(psrldq_table)
- .int L(psrldq_13) - L(psrldq_table)
- .int L(psrldq_14) - L(psrldq_table)
- .int L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strrchr_sse2, @function; \
- .align 16; \
- .globl __strrchr_sse2; \
- .hidden __strrchr_sse2; \
- __strrchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
- The speedup we get from using SSE4.2 instruction is likely eaten away
- by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
- .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index e413b07..3bde291 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -1,6 +1,6 @@
-/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
- For AMD x86-64.
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
+/* strrchr with SSE2 without bsf and bsr
+ Copyright (C) 2011-2013 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -17,63 +17,538 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include <sysdep.h>
+#include "asm-syntax.h"
- .text
ENTRY (strrchr)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
+
+ movd %rsi, %xmm1
pxor %xmm2, %xmm2
+ mov %rdi, %rcx
punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+ punpcklbw %xmm1, %xmm1
+ /* RCX = offset of the string start within its 64-byte block. */
+ and $63, %rcx
+ cmp $48, %rcx
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
+ ja L(crosscache)
+
+/* Offset <= 48: an unaligned 16-byte load cannot cross the 64-byte boundary. */
+ movdqu (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %rcx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %rax
+ add $16, %rdi
+
+ test %rax, %rax
+ jnz L(unaligned_match1)
+
+ test %rcx, %rcx
+ jnz L(return_null)
+
+ and $-16, %rdi
+ xor %r8, %r8
+ jmp L(loop)
+
+ .p2align 4
+L(unaligned_match1):
+ test %rcx, %rcx
+ jnz L(prolog_find_zero_1)
+
+ mov %rax, %r8
+ mov %rdi, %rsi
+ and $-16, %rdi
+ jmp L(loop)
+
+ .p2align 4
+L(crosscache):
+/* Start offset within its 64-byte block exceeds 48: load the enclosing aligned 16-byte block instead of reading from the string start. */
+ and $15, %rcx
+ and $-16, %rdi
+ pxor %xmm3, %xmm3
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* RDX = bitmask of NUL bytes in the aligned block. */
+ pmovmskb %xmm3, %rdx
+ /* RAX = bitmask of bytes equal to the search character. */
+ pmovmskb %xmm0, %rax
+ /* Shift out the bits for bytes that precede the start of the string. */
+ shr %cl, %rdx
+ shr %cl, %rax
+ add $16, %rdi
+
+ test %rax, %rax
+ jnz L(unaligned_match)
+
+ test %rdx, %rdx
+ jnz L(return_null)
+
+ xor %r8, %r8
+ jmp L(loop)
+
+ .p2align 4
+L(unaligned_match):
+ test %rdx, %rdx
+ jnz L(prolog_find_zero)
+
+ mov %rax, %r8
+ lea (%rdi, %rcx), %rsi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %rdi
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- andl %esi, %edx
- andl %esi, %ecx
- xorl %eax, %eax
- movl %edx, %esi
- orl %ecx, %esi
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %rdi
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- movl %edx, %esi
- orl %ecx, %esi
- jz 2b
-
-1: bsfl %ecx, %r9d
- movl $0xffffffff, %r8d
- movl $31, %ecx
- jnz 5f
-
- bsrl %edx, %edx
- jz 2b
- leaq -16(%rdi,%rdx), %rax
- jmp 2b
-
-5: subl %r9d, %ecx
- shrl %cl, %r8d
- andl %r8d, %edx
- bsrl %edx, %edx
- jz 4f
- leaq -16(%rdi,%rdx), %rax
-4: ret
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ or %rax, %rcx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %rcx
+ pmovmskb %xmm0, %rax
+ or %rax, %rcx
+ jz L(loop)
+
+L(matches):
+ test %rax, %rax
+ jnz L(match)
+L(return_value):
+ test %r8, %r8
+ jz L(return_null)
+ mov %r8, %rax
+ mov %rsi, %rdi
+ jmp L(match_exit)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %rcx
+ test %rcx, %rcx
+ jnz L(find_zero)
+ mov %rax, %r8
+ mov %rdi, %rsi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(find_zero_8)
+ test $0x01, %cl
+ jnz L(FindZeroExit1)
+ test $0x02, %cl
+ jnz L(FindZeroExit2)
+ test $0x04, %cl
+ jnz L(FindZeroExit3)
+ and $1 << 4 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(find_zero_8):
+ test $0x10, %cl
+ jnz L(FindZeroExit5)
+ test $0x20, %cl
+ jnz L(FindZeroExit6)
+ test $0x40, %cl
+ jnz L(FindZeroExit7)
+ and $1 << 8 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(find_zero_high_8)
+ test $0x01, %ch
+ jnz L(FindZeroExit9)
+ test $0x02, %ch
+ jnz L(FindZeroExit10)
+ test $0x04, %ch
+ jnz L(FindZeroExit11)
+ and $1 << 12 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(find_zero_high_8):
+ test $0x10, %ch
+ jnz L(FindZeroExit13)
+ test $0x20, %ch
+ jnz L(FindZeroExit14)
+ test $0x40, %ch
+ jnz L(FindZeroExit15)
+ and $1 << 16 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit1):
+ and $1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit2):
+ and $1 << 2 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit3):
+ and $1 << 3 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit5):
+ and $1 << 5 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit6):
+ and $1 << 6 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit7):
+ and $1 << 7 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit9):
+ and $1 << 9 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit10):
+ and $1 << 10 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit11):
+ and $1 << 11 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit13):
+ and $1 << 13 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit14):
+ and $1 << 14 - 1, %rax
+ jz L(return_value)
+ jmp L(match_exit)
+
+ .p2align 4
+L(FindZeroExit15):
+ and $1 << 15 - 1, %rax
+ jz L(return_value)
+
+ .p2align 4
+L(match_exit):
+ test %ah, %ah
+ jnz L(match_exit_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(match_exit_8)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x02, %al
+ jnz L(Exit2)
+ lea -16(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_exit_8):
+ test $0x80, %al
+ jnz L(Exit8)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x20, %al
+ jnz L(Exit6)
+ lea -12(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_exit_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(match_exit_high_8)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x02, %ah
+ jnz L(Exit10)
+ lea -8(%rdi), %rax
+ ret
+
+ .p2align 4
+L(match_exit_high_8):
+ test $0x80, %ah
+ jnz L(Exit16)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x20, %ah
+ jnz L(Exit14)
+ lea -4(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit2):
+ lea -15(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit3):
+ lea -14(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit4):
+ lea -13(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit6):
+ lea -11(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit7):
+ lea -10(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit8):
+ lea -9(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit10):
+ lea -7(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit11):
+ lea -6(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit12):
+ lea -5(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit14):
+ lea -3(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit15):
+ lea -2(%rdi), %rax
+ ret
+
+ .p2align 4
+L(Exit16):
+ lea -1(%rdi), %rax
+ ret
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero):
+ add %rcx, %rdi
+ mov %rdx, %rcx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(prolog_find_zero_8)
+ test $0x01, %cl
+ jnz L(PrologFindZeroExit1)
+ test $0x02, %cl
+ jnz L(PrologFindZeroExit2)
+ test $0x04, %cl
+ jnz L(PrologFindZeroExit3)
+ and $1 << 4 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_8):
+ test $0x10, %cl
+ jnz L(PrologFindZeroExit5)
+ test $0x20, %cl
+ jnz L(PrologFindZeroExit6)
+ test $0x40, %cl
+ jnz L(PrologFindZeroExit7)
+ and $1 << 8 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(prolog_find_zero_high_8)
+ test $0x01, %ch
+ jnz L(PrologFindZeroExit9)
+ test $0x02, %ch
+ jnz L(PrologFindZeroExit10)
+ test $0x04, %ch
+ jnz L(PrologFindZeroExit11)
+ and $1 << 12 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(prolog_find_zero_high_8):
+ test $0x10, %ch
+ jnz L(PrologFindZeroExit13)
+ test $0x20, %ch
+ jnz L(PrologFindZeroExit14)
+ test $0x40, %ch
+ jnz L(PrologFindZeroExit15)
+ and $1 << 16 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit1):
+ and $1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit2):
+ and $1 << 2 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit3):
+ and $1 << 3 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit5):
+ and $1 << 5 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit6):
+ and $1 << 6 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit7):
+ and $1 << 7 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit9):
+ and $1 << 9 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit10):
+ and $1 << 10 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit11):
+ and $1 << 11 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit13):
+ and $1 << 13 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit14):
+ and $1 << 14 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(PrologFindZeroExit15):
+ and $1 << 15 - 1, %rax
+ jnz L(match_exit)
+ xor %rax, %rax
+ ret
+
END (strrchr)
weak_alias (strrchr, rindex)
--
1.8.3.2