This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH 1/2] Fix strrchr regression.

From: OndÅej BÃlka <neleai at seznam dot cz>
To: libc-alpha at sourceware dot org
Date: Mon, 5 Aug 2013 19:33:46 +0200
Subject: [PATCH 1/2] Fix strrchr regression.
Hi, 

I intentionally waited with this patch to check if there is somebody
else who reads benchmark results. 

I knew for two years that using SSE4.2 for strrchr causes performance regression 
on all architectures that support it. I also checked that it is present
in benchmark outputs on nehalem but nobody noticed.

Also nobody noticed that results do not make lot of sense. It is quite
weird that byte-by byte implementation is slower for 32 bytes than when
it checks 2048 bytes, isn't it?

               simple_strrchr  __strrchr_sse42 __strrchr_sse2_no_bsf __strrchr_sse2
Length   32, alignment in bytes  0: 3929  353 320 341
Length   32, alignment in bytes  1: 3932  353 293 341
Length   64, alignment in bytes  0: 3923  359 299 341
Length   64, alignment in bytes  2: 3929  359 293 341
Length  128, alignment in bytes  0: 3929  359 296 341
Length  128, alignment in bytes  3: 3917  356 287 338
Length  256, alignment in bytes  0: 3929  405 308 341
Length  256, alignment in bytes  4: 3932  365 296 344
Length  512, alignment in bytes  0: 3929  378 305 341
Length  512, alignment in bytes  5: 3929  399 308 344
Length 1024, alignment in bytes  0: 3932  420 305 344
Length 1024, alignment in bytes  6: 3757  396 302 362
Length 2048, alignment in bytes  0: 3633  362 320 338
Length 2048, alignment in bytes  7: 3633  356 314 338

To get reliable results I added strrchr to my profiler. You can clearly
see asymptotic behavior of strrchr here.
http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html

Actually there are two regressions here. First one is that strchr_sse42
is also about 14% asymptotically slower in sandy/ivy bridge and around
3% slower for fx10. 

Second is more serious, on all architectures that I tested an
__strrchr_sse2_no_bsf came asymptotically faster than __strrchr_sse2 and
consecutively __strrchr_sse42. 

Given these regressions it is viable to remove  __strrchr_sse42 and __strrchr_sse2
and use __strrchr_sse2_no_bsf as base implementation. 

We can accept this by same reasoning that lead us to choose
__strrchr_sse2 over _strrchr_sse42 for silvermont architecture.

I will send second part that optimizes strrchr implementation later.

---
 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 ------------------------
 sysdeps/x86_64/multiarch/strrchr.S             | 288 ------------
 sysdeps/x86_64/strrchr.S                       | 579 ++++++++++++++++++++++---
 3 files changed, 527 insertions(+), 895 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
deleted file mode 100644
index fcef610..0000000
--- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
+++ /dev/null
@@ -1,555 +0,0 @@
-/* strrchr with SSE2 without bsf and bsr
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-# include <sysdep.h>
-# include "asm-syntax.h"
-
-	atom_text_section
-ENTRY (__strrchr_sse2_no_bsf)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	punpcklbw %xmm1, %xmm1
-	/* ECX has OFFSET. */
-	and	$63, %rcx
-	cmp	$48, %rcx
-	pshufd	$0, %xmm1, %xmm1
-	ja	L(crosscache)
-
-/* unaligned string. */
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm2, %rcx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match1)
-
-	test	%rcx, %rcx
-	jnz	L(return_null)
-
-	and	$-16, %rdi
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match1):
-	test	%rcx, %rcx
-	jnz	L(prolog_find_zero_1)
-
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	and	$-16, %rdi
-	jmp	L(loop)
-
-	.p2align 4
-L(crosscache):
-/* Hancle unaligned string.  */
-	and	$15, %rcx
-	and	$-16, %rdi
-	pxor	%xmm3, %xmm3
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm3, %rdx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %rax
-	/* Remove the leading bytes.  */
-	shr	%cl, %rdx
-	shr	%cl, %rax
-	add	$16, %rdi
-
-	test	%rax, %rax
-	jnz	L(unaligned_match)
-
-	test	%rdx, %rdx
-	jnz	L(return_null)
-
-	xor	%r8, %r8
-	jmp	L(loop)
-
-	.p2align 4
-L(unaligned_match):
-	test	%rdx, %rdx
-	jnz	L(prolog_find_zero)
-
-	mov	%rax, %r8
-	lea	(%rdi, %rcx), %rsi
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm2, %rcx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rcx
-	jz	L(loop)
-
-L(matches):
-	test	%rax, %rax
-	jnz	L(match)
-L(return_value):
-	test	%r8, %r8
-	jz	L(return_null)
-	mov	%r8, %rax
-	mov	%rsi, %rdi
-	jmp	L(match_exit)
-
-	.p2align 4
-L(match):
-	pmovmskb %xmm2, %rcx
-	test	%rcx, %rcx
-	jnz	L(find_zero)
-	mov	%rax, %r8
-	mov	%rdi, %rsi
-	jmp	L(loop)
-
-	.p2align 4
-L(find_zero):
-	test	%cl, %cl
-	jz	L(find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(find_zero_8)
-	test	$0x01, %cl
-	jnz	L(FindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(FindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(FindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_8):
-	test	$0x10, %cl
-	jnz	L(FindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(FindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(FindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(FindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(FindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(FindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(FindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(FindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(FindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit1):
-	and	$1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jz	L(return_value)
-	jmp	L(match_exit)
-
-	.p2align 4
-L(FindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jz	L(return_value)
-
-	.p2align 4
-L(match_exit):
-	test	%ah, %ah
-	jnz	L(match_exit_high)
-	mov	%al, %dl
-	and	$15 << 4, %dl
-	jnz	L(match_exit_8)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	lea	-16(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_8):
-	test	$0x80, %al
-	jnz	L(Exit8)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	lea	-12(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high):
-	mov	%ah, %dh
-	and	$15 << 4, %dh
-	jnz	L(match_exit_high_8)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	lea	-8(%rdi), %rax
-	ret
-
-	.p2align 4
-L(match_exit_high_8):
-	test	$0x80, %ah
-	jnz	L(Exit16)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	lea	-4(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit2):
-	lea	-15(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit3):
-	lea	-14(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit4):
-	lea	-13(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit6):
-	lea	-11(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit7):
-	lea	-10(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit8):
-	lea	-9(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit10):
-	lea	-7(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit11):
-	lea	-6(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit12):
-	lea	-5(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit14):
-	lea	-3(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit15):
-	lea	-2(%rdi), %rax
-	ret
-
-	.p2align 4
-L(Exit16):
-	lea	-1(%rdi), %rax
-	ret
-
-/* Return NULL.  */
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero):
-	add	%rcx, %rdi
-	mov     %rdx, %rcx
-L(prolog_find_zero_1):
-	test	%cl, %cl
-	jz	L(prolog_find_zero_high)
-	mov	%cl, %dl
-	and	$15, %dl
-	jz	L(prolog_find_zero_8)
-	test	$0x01, %cl
-	jnz	L(PrologFindZeroExit1)
-	test	$0x02, %cl
-	jnz	L(PrologFindZeroExit2)
-	test	$0x04, %cl
-	jnz	L(PrologFindZeroExit3)
-	and	$1 << 4 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_8):
-	test	$0x10, %cl
-	jnz	L(PrologFindZeroExit5)
-	test	$0x20, %cl
-	jnz	L(PrologFindZeroExit6)
-	test	$0x40, %cl
-	jnz	L(PrologFindZeroExit7)
-	and	$1 << 8 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high):
-	mov	%ch, %dh
-	and	$15, %dh
-	jz	L(prolog_find_zero_high_8)
-	test	$0x01, %ch
-	jnz	L(PrologFindZeroExit9)
-	test	$0x02, %ch
-	jnz	L(PrologFindZeroExit10)
-	test	$0x04, %ch
-	jnz	L(PrologFindZeroExit11)
-	and	$1 << 12 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(prolog_find_zero_high_8):
-	test	$0x10, %ch
-	jnz	L(PrologFindZeroExit13)
-	test	$0x20, %ch
-	jnz	L(PrologFindZeroExit14)
-	test	$0x40, %ch
-	jnz	L(PrologFindZeroExit15)
-	and	$1 << 16 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit1):
-	and	$1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit2):
-	and	$1 << 2 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit3):
-	and	$1 << 3 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit5):
-	and	$1 << 5 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit6):
-	and	$1 << 6 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit7):
-	and	$1 << 7 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit9):
-	and	$1 << 9 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit10):
-	and	$1 << 10 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit11):
-	and	$1 << 11 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit13):
-	and	$1 << 13 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit14):
-	and	$1 << 14 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-	.p2align 4
-L(PrologFindZeroExit15):
-	and	$1 << 15 - 1, %rax
-	jnz	L(match_exit)
-	xor	%rax, %rax
-	ret
-
-END (__strrchr_sse2_no_bsf)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
deleted file mode 100644
index 3f92a41..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* Multiple versions of strrchr
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
-   the DSO.  In static binaries we need strrchr before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.text
-ENTRY(strrchr)
-	.type	strrchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strrchr_sse2(%rip), %rax
-	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
-	jnz	2f
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strrchr_sse42(%rip), %rax
-	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strrchr_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strrchr)
-
-/*
-   This implementation uses SSE4 instructions to compare up to 16 bytes
-   at a time looking for the last occurrence of the character c in the
-   string s:
-
-   char *strrchr (const char *s, int c);
-
-   We use 0x4a:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_EACH
-	| _SIDD_MOST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   C C C C C C C C C C C C C C C C
-
-   to find out if the first 16byte data element has a byte C and the
-   last offset.  There are 4 cases:
-
-   1. The first 16byte data element has EOS and has the byte C at the
-      last offset X.
-   2. The first 16byte data element is valid and has the byte C at the
-      last offset X.
-   3. The first 16byte data element has EOS and doesn't have the byte C.
-   4. The first 16byte data element is valid and doesn't have the byte C.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	  1	  0
-    2		 X	  1	  0	  0
-    3		16	  0	  1	  0
-    4		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 3 with jz which branches
-   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
-
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__strrchr_sse42, @function
-	.globl	__strrchr_sse42
-	.hidden	__strrchr_sse42
-__strrchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	testb	%sil, %sil
-	je	__strend_sse4
-	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
-	movd	%esi, %xmm1
-	punpcklbw	%xmm1, %xmm1
-	movl	%edi, %esi
-	punpcklbw	%xmm1, %xmm1
-	andl	$15, %esi
-	pshufd	$0, %xmm1, %xmm1
-	movq	%rdi, %r8
-	je	L(loop)
-
-/* Handle unaligned string using psrldq.  */
-	leaq	L(psrldq_table)(%rip), %rdx
-	andq	$-16, %r8
-	movslq	(%rdx,%rsi,4),%r9
-	movdqa	(%r8), %xmm0
-	addq	%rdx, %r9
-	jmp	*%r9
-
-/* Handle unaligned string with offset 1 using psrldq.  */
-	.p2align 4
-L(psrldq_1):
-	psrldq	$1, %xmm0
-
-	.p2align 4
-L(unaligned_pcmpistri):
-	pcmpistri	$0x4a, %xmm1, %xmm0
-	jnc	L(unaligned_no_byte)
-	leaq	(%rdi,%rcx), %rax
-L(unaligned_no_byte):
-	/* Find the length of the unaligned string.  */
-	pcmpistri	$0x3a, %xmm0, %xmm0
-	movl	$16, %edx
-	subl	%esi, %edx
-	cmpl	%ecx, %edx
-	/* Return RAX if the unaligned fragment to next 16B already
-	   contain the NULL terminator.  */
-	jg	L(exit)
-	addq	$16, %r8
-
-/* Loop start on aligned string.  */
-	.p2align 4
-L(loop):
-	pcmpistri	$0x4a, (%r8), %xmm1
-	jbe	L(match_or_eos)
-	addq	$16, %r8
-	jmp	L(loop)
-	.p2align 4
-L(match_or_eos):
-	je	L(had_eos)
-L(match_no_eos):
-	leaq	(%r8,%rcx), %rax
-	addq	$16, %r8
-	jmp     L(loop)
-	.p2align 4
-L(had_eos):
-	jnc     L(exit)
-	leaq	(%r8,%rcx), %rax
-	.p2align 4
-L(exit):
-	ret
-
-/* Handle unaligned string with offset 15 using psrldq.  */
-	.p2align 4
-L(psrldq_15):
-	psrldq	$15, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 14 using psrldq.  */
-	.p2align 4
-L(psrldq_14):
-	psrldq	$14, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 13 using psrldq.  */
-	.p2align 4
-L(psrldq_13):
-	psrldq	$13, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 12 using psrldq.  */
-	.p2align 4
-L(psrldq_12):
-	psrldq	$12, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 11 using psrldq.  */
-	.p2align 4
-L(psrldq_11):
-	psrldq	$11, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 10 using psrldq.  */
-	.p2align 4
-L(psrldq_10):
-	psrldq	$10, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 9 using psrldq.  */
-	.p2align 4
-L(psrldq_9):
-	psrldq	$9, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 8 using psrldq.  */
-	.p2align 4
-L(psrldq_8):
-	psrldq	$8, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 7 using psrldq.  */
-	.p2align 4
-L(psrldq_7):
-	psrldq	$7, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 6 using psrldq.  */
-	.p2align 4
-L(psrldq_6):
-	psrldq	$6, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 5 using psrldq.  */
-	.p2align 4
-L(psrldq_5):
-	psrldq	$5, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 4 using psrldq.  */
-	.p2align 4
-L(psrldq_4):
-	psrldq	$4, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 3 using psrldq.  */
-	.p2align 4
-L(psrldq_3):
-	psrldq	$3, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-/* Handle unaligned string with offset 2 using psrldq.  */
-	.p2align 4
-L(psrldq_2):
-	psrldq	$2, %xmm0
-	jmp	L(unaligned_pcmpistri)
-
-	cfi_endproc
-	.size	__strrchr_sse42, .-__strrchr_sse42
-
-	.section .rodata.sse4.2,"a",@progbits
-	.p2align 4
-L(psrldq_table):
-	.int	L(loop) - L(psrldq_table)
-	.int	L(psrldq_1) - L(psrldq_table)
-	.int	L(psrldq_2) - L(psrldq_table)
-	.int	L(psrldq_3) - L(psrldq_table)
-	.int	L(psrldq_4) - L(psrldq_table)
-	.int	L(psrldq_5) - L(psrldq_table)
-	.int	L(psrldq_6) - L(psrldq_table)
-	.int	L(psrldq_7) - L(psrldq_table)
-	.int	L(psrldq_8) - L(psrldq_table)
-	.int	L(psrldq_9) - L(psrldq_table)
-	.int	L(psrldq_10) - L(psrldq_table)
-	.int	L(psrldq_11) - L(psrldq_table)
-	.int	L(psrldq_12) - L(psrldq_table)
-	.int	L(psrldq_13) - L(psrldq_table)
-	.int	L(psrldq_14) - L(psrldq_table)
-	.int	L(psrldq_15) - L(psrldq_table)
-
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strrchr_sse2, @function; \
-	.align 16; \
-	.globl __strrchr_sse2; \
-	.hidden __strrchr_sse2; \
-	__strrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
-#endif
-
-#include "../strrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index e413b07..3bde291 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -1,6 +1,6 @@
-/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+/* strrchr with SSE2 without bsf and bsr
+   Copyright (C) 2011-2013 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,63 +17,538 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
+#include <sysdep.h>
+#include "asm-syntax.h"
 
-	.text
 ENTRY (strrchr)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
+
+	movd	%rsi, %xmm1
 	pxor	%xmm2, %xmm2
+	mov	%rdi, %rcx
 	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET. */
+	and	$63, %rcx
+	cmp	$48, %rcx
 	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
+	ja	L(crosscache)
+
+/* unaligned string. */
+	movdqu	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %rcx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match1)
+
+	test	%rcx, %rcx
+	jnz	L(return_null)
+
+	and	$-16, %rdi
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%rcx, %rcx
+	jnz	L(prolog_find_zero_1)
+
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	and	$-16, %rdi
+	jmp	L(loop)
+
+	.p2align 4
+L(crosscache):
+/* Hancle unaligned string.  */
+	and	$15, %rcx
+	and	$-16, %rdi
+	pxor	%xmm3, %xmm3
+	movdqa	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm3, %rdx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %rax
+	/* Remove the leading bytes.  */
+	shr	%cl, %rdx
+	shr	%cl, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match)
+
+	test	%rdx, %rdx
+	jnz	L(return_null)
+
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match):
+	test	%rdx, %rdx
+	jnz	L(prolog_find_zero)
+
+	mov	%rax, %r8
+	lea	(%rdi, %rcx), %rsi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %rdi
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	andl	%esi, %edx
-	andl	%esi, %ecx
-	xorl	%eax, %eax
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %rdi
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	movl	%edx, %esi
-	orl	%ecx, %esi
-	jz	2b
-
-1:	bsfl	%ecx, %r9d
-	movl	$0xffffffff, %r8d
-	movl	$31, %ecx
-	jnz	5f
-
-	bsrl	%edx, %edx
-	jz	2b
-	leaq	-16(%rdi,%rdx), %rax
-	jmp	2b
-
-5:	subl	%r9d, %ecx
-	shrl	%cl, %r8d
-	andl	%r8d, %edx
-	bsrl	%edx, %edx
-	jz	4f
-	leaq	-16(%rdi,%rdx), %rax
-4:	ret
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jz	L(loop)
+
+L(matches):
+	test	%rax, %rax
+	jnz	L(match)
+L(return_value):
+	test	%r8, %r8
+	jz	L(return_null)
+	mov	%r8, %rax
+	mov	%rsi, %rdi
+	jmp	L(match_exit)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %rcx
+	test	%rcx, %rcx
+	jnz	L(find_zero)
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	%cl, %cl
+	jz	L(find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(find_zero_8)
+	test	$0x01, %cl
+	jnz	L(FindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(FindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(FindZeroExit3)
+	and	$1 << 4 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(find_zero_8):
+	test	$0x10, %cl
+	jnz	L(FindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(FindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(FindZeroExit7)
+	and	$1 << 8 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(FindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(FindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(FindZeroExit11)
+	and	$1 << 12 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(FindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(FindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(FindZeroExit15)
+	and	$1 << 16 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit1):
+	and	$1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit2):
+	and	$1 << 2 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit3):
+	and	$1 << 3 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit5):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit6):
+	and	$1 << 6 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit7):
+	and	$1 << 7 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit9):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit10):
+	and	$1 << 10 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit11):
+	and	$1 << 11 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit13):
+	and	$1 << 13 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit14):
+	and	$1 << 14 - 1, %rax
+	jz	L(return_value)
+	jmp	L(match_exit)
+
+	.p2align 4
+L(FindZeroExit15):
+	and	$1 << 15 - 1, %rax
+	jz	L(return_value)
+
+	.p2align 4
+L(match_exit):
+	test	%ah, %ah
+	jnz	L(match_exit_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(match_exit_8)
+	test	$0x08, %al
+	jnz	L(Exit4)
+	test	$0x04, %al
+	jnz	L(Exit3)
+	test	$0x02, %al
+	jnz	L(Exit2)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_exit_8):
+	test	$0x80, %al
+	jnz	L(Exit8)
+	test	$0x40, %al
+	jnz	L(Exit7)
+	test	$0x20, %al
+	jnz	L(Exit6)
+	lea	-12(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_exit_high):
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(match_exit_high_8)
+	test	$0x08, %ah
+	jnz	L(Exit12)
+	test	$0x04, %ah
+	jnz	L(Exit11)
+	test	$0x02, %ah
+	jnz	L(Exit10)
+	lea	-8(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_exit_high_8):
+	test	$0x80, %ah
+	jnz	L(Exit16)
+	test	$0x40, %ah
+	jnz	L(Exit15)
+	test	$0x20, %ah
+	jnz	L(Exit14)
+	lea	-4(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit2):
+	lea	-15(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit3):
+	lea	-14(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	lea	-13(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit6):
+	lea	-11(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit7):
+	lea	-10(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit8):
+	lea	-9(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit10):
+	lea	-7(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit11):
+	lea	-6(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	lea	-5(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit14):
+	lea	-3(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit15):
+	lea	-2(%rdi), %rax
+	ret
+
+	.p2align 4
+L(Exit16):
+	lea	-1(%rdi), %rax
+	ret
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%rcx, %rdi
+	mov     %rdx, %rcx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_high)
+	mov	%cl, %dl
+	and	$15, %dl
+	jz	L(prolog_find_zero_8)
+	test	$0x01, %cl
+	jnz	L(PrologFindZeroExit1)
+	test	$0x02, %cl
+	jnz	L(PrologFindZeroExit2)
+	test	$0x04, %cl
+	jnz	L(PrologFindZeroExit3)
+	and	$1 << 4 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_8):
+	test	$0x10, %cl
+	jnz	L(PrologFindZeroExit5)
+	test	$0x20, %cl
+	jnz	L(PrologFindZeroExit6)
+	test	$0x40, %cl
+	jnz	L(PrologFindZeroExit7)
+	and	$1 << 8 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_high):
+	mov	%ch, %dh
+	and	$15, %dh
+	jz	L(prolog_find_zero_high_8)
+	test	$0x01, %ch
+	jnz	L(PrologFindZeroExit9)
+	test	$0x02, %ch
+	jnz	L(PrologFindZeroExit10)
+	test	$0x04, %ch
+	jnz	L(PrologFindZeroExit11)
+	and	$1 << 12 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_high_8):
+	test	$0x10, %ch
+	jnz	L(PrologFindZeroExit13)
+	test	$0x20, %ch
+	jnz	L(PrologFindZeroExit14)
+	test	$0x40, %ch
+	jnz	L(PrologFindZeroExit15)
+	and	$1 << 16 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit1):
+	and	$1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit2):
+	and	$1 << 2 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit3):
+	and	$1 << 3 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit5):
+	and	$1 << 5 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit6):
+	and	$1 << 6 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit7):
+	and	$1 << 7 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit9):
+	and	$1 << 9 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit10):
+	and	$1 << 10 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit11):
+	and	$1 << 11 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit13):
+	and	$1 << 13 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit14):
+	and	$1 << 14 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
+	.p2align 4
+L(PrologFindZeroExit15):
+	and	$1 << 15 - 1, %rax
+	jnz	L(match_exit)
+	xor	%rax, %rax
+	ret
+
 END (strrchr)
 
 weak_alias (strrchr, rindex)
-- 
1.8.3.2
Follow-Ups:
- Re: [PATCH 1.5/2] Fix strrchr regression.
  - From: OndÅej BÃlka
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]