[PATCH v1.1] Optimize strrchr more.


On Fri, Oct 04, 2013 at 10:15:22PM +0200, Ondřej Bílka wrote:
> Hi,
> 
> I played with my evolutionary algorithms to optimize various functions.
> It helped a bit, but I got more by looking at the code again, because I
> noticed several opportunities that I had missed.
> 
> The first one is that the best way to test whether bytes are zero is to
> prepare a zero register in advance. Assume you have this code:
> 
> pxor %xmm3, %xmm3
> movdqa %xmm1, %xmm2
> pcmpeqb %xmm3, %xmm1 /*get zero mask */
> # do something with xmm2
> 
> This could be changed into:
> 
> pxor %xmm1, %xmm1
> pcmpeqb %xmm2, %xmm1 /*get zero mask */
> # do something with xmm2
> 
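This is what the block of pxor instructions at the top of the patched
ENTRY (strrchr) below is for: the zero registers (%xmm4 ... %xmm7) are
prepared once up front, so each loaded block can be compared in place,
roughly:

pxor %xmm4, %xmm4 /* zero register prepared in advance */
movdqu (%rdi), %xmm0
pcmpeqb %xmm0, %xmm4 /* get zero mask, no movdqa copy needed */
pcmpeqb %xmm1, %xmm0 /* get match mask for the searched byte */
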
> The second improvement is using 32-byte registers where beneficial.
> 
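As an illustration only (this assumes AVX2-style 32-byte ymm registers
are available; the patch below stays within 16-byte SSE registers), a
32-byte zero check could look roughly like:

vpxor %ymm2, %ymm2, %ymm2 /* zero register; ymm ops here require AVX2 */
vmovdqu (%rdi), %ymm0 /* load 32 bytes */
vpcmpeqb %ymm0, %ymm2, %ymm3 /* get 32-byte zero mask */
vpmovmskb %ymm3, %eax /* 32-bit mask, one bit per byte */
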
> And the last one is that in the previous iteration I did not thoroughly
> check the loop generated by gcc. A CSE pass added four extra moves that
> are useful only after exiting the loop. By removing these we gained
> around 10% on most architectures for large inputs.
> 
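As a rough sketch (not the exact code gcc emitted, and with arbitrary
register names), the pattern is a copy executed on every iteration even
though its result is only read after the loop exits:

L(loop):
	/* Illustrative loop, not the actual strrchr loop.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %edx
	movq	%rdx, %r9	/* dead inside the loop, read only after exit */
	addq	$16, %rdi
	testq	%rdx, %rdx
	je	L(loop)

Dropping such copies and reading %rdx directly after the loop removes
the per-iteration cost.
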
> Then there is the evolver itself, which rearranges the scheduling into
> a faster one, but its effect is relatively small (1%-2%).
> 
> OK to commit?
> 
I introduced a typo during manual editing: I tested the wrong register
at exit from the loop.

Here is the fixed version.

 	* sysdeps/x86_64/strrchr.S: Optimize implementation.

diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 514765b..d532206 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -26,61 +26,72 @@
 
 	.text
 ENTRY (strrchr)
+	pxor	%xmm6, %xmm6
 	movd	%esi, %xmm1
+	pxor	%xmm7, %xmm7
 	movq	%rdi, %rax
+#ifdef USE_SSSE3
 	andl	$4095, %eax
-	punpcklbw	%xmm1, %xmm1
-	cmpq	$4032, %rax
-	punpcklwd	%xmm1, %xmm1
+	pxor	%xmm5, %xmm5
+	pxor	%xmm4, %xmm4
+	cmp	$4032, %eax
+	pshufb	%xmm7, %xmm1
+#else
+	punpcklbw %xmm1, %xmm1
+	andl	$4095, %eax
+	pxor	%xmm5, %xmm5
+	pxor	%xmm4, %xmm4
+	cmp	$4032, %eax
+	punpcklwd %xmm1, %xmm1
 	pshufd	$0, %xmm1, %xmm1
+#endif
 	ja	L(cross_page)
 	movdqu	(%rdi), %xmm0
-	pxor	%xmm2, %xmm2
-	movdqa	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb	%xmm0, %ecx
-	pmovmskb	%xmm3, %edx
-	testq	%rdx, %rdx
+	pmovmskb %xmm0, %ecx
+	pmovmskb %xmm4, %edx
+	test	%edx, %edx
 	je	L(next_48_bytes)
-	leaq	-1(%rdx), %rax
-	xorq	%rdx, %rax
-	andq	%rcx, %rax
+	lea	-1(%edx), %eax
+	xor	%edx, %eax
+	and	%ecx, %eax
 	je	L(exit)
-	bsrq	%rax, %rax
+	bsr	%eax, %eax
 	addq	%rdi, %rax
-	ret
+ ret
+
+L(exit):
+	xorl	%eax, %eax
+ ret
 
-	ALIGN(4)
+ .p2align 4
 L(next_48_bytes):
-	movdqu	16(%rdi), %xmm4
-	movdqa	%xmm4, %xmm5
 	movdqu	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm4
+	movdqu	48(%rdi), %xmm4
+	pcmpeqb	%xmm4, %xmm7
+	movdqu	16(%rdi), %xmm2
 	pcmpeqb	%xmm2, %xmm5
-	movdqu	48(%rdi), %xmm0
-	pmovmskb	%xmm5, %edx
-	movdqa	%xmm3, %xmm5
+	pmovmskb %xmm5, %eax
+	pcmpeqb	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %esi
+	salq	$16, %rsi
 	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm5
-	pcmpeqb	%xmm0, %xmm2
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r8d
-	pmovmskb	%xmm5, %eax
-	pmovmskb	%xmm2, %esi
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %edx
+	orq	%rcx, %rsi
+	pmovmskb %xmm6, %ecx
+	pmovmskb %xmm3, %r8d
+	salq	$16, %rax
+	salq	$32, %rcx
 	salq	$32, %r8
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rdx, %rax
-	movq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
 	salq	$48, %rdx
-	salq	$16, %rsi
+	orq	%rcx, %rax
 	orq	%r8, %rsi
-	orq	%rcx, %rsi
-	pmovmskb	%xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rsi
+	orq	%rdx, %rsi
+	pmovmskb %xmm7, %edx
+	salq	$48, %rdx
 	orq	%rdx, %rax
 	je	L(loop_header2)
 	leaq	-1(%rax), %rcx
@@ -88,71 +99,69 @@ L(next_48_bytes):
 	andq	%rcx, %rsi
 	je	L(exit)
 	bsrq	%rsi, %rsi
-	leaq	(%rdi,%rsi), %rax
-	ret
+	leaq	(%rdi, %rsi), %rax
+	ret
 
-	ALIGN(4)
+	.p2align 3
 L(loop_header2):
 	testq	%rsi, %rsi
 	movq	%rdi, %rcx
-	je	L(no_c_found)
+	jne	L(loop_header)
+	movl	$1, %esi /* Evaluates to null.  */
+	xorl	%ecx, %ecx
 L(loop_header):
 	addq	$64, %rdi
 	pxor	%xmm7, %xmm7
 	andq	$-64, %rdi
 	jmp	L(loop_entry)
 
-	ALIGN(4)
+	.p2align 3
 L(loop64):
 	testq	%rdx, %rdx
 	cmovne	%rdx, %rsi
 	cmovne	%rdi, %rcx
 	addq	$64, %rdi
 L(loop_entry):
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm6, %xmm6
-	movdqa	48(%rdi), %xmm2
-	movdqa	%xmm3, %xmm0
-	movdqa	16(%rdi), %xmm4
-	pminub	%xmm2, %xmm0
-	movdqa	(%rdi), %xmm5
-	pminub	%xmm4, %xmm0
-	pminub	%xmm5, %xmm0
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb	%xmm0, %eax
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %r9d
-	movdqa	%xmm4, %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %edx
-	movdqa	%xmm3, %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	movdqa	16(%rdi), %xmm3
+	movdqa	48(%rdi), %xmm5
+	movdqa	(%rdi), %xmm2
+	movdqa	%xmm2, %xmm6
+	pminub	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	movdqa	32(%rdi), %xmm4
+	pminub	%xmm4, %xmm6
+	pminub	%xmm5, %xmm6
+	pmovmskb %xmm3, %edx
+	pcmpeqb	%xmm7, %xmm6
+	pmovmskb %xmm6, %eax
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %r10d
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm5, %r8d
 	salq	$16, %rdx
-	pmovmskb	%xmm0, %r10d
-	movdqa	%xmm2, %xmm0
-	pcmpeqb	%xmm1, %xmm0
 	salq	$32, %r10
 	orq	%r10, %rdx
-	pmovmskb	%xmm0, %r8d
+	pmovmskb %xmm2, %r9d
 	orq	%r9, %rdx
 	salq	$48, %r8
 	orq	%r8, %rdx
 	testl	%eax, %eax
 	je	L(loop64)
-	pcmpeqb	%xmm6, %xmm4
-	pcmpeqb	%xmm6, %xmm3
-	pcmpeqb	%xmm6, %xmm5
-	pmovmskb	%xmm4, %eax
-	pmovmskb	%xmm3, %r10d
-	pcmpeqb	%xmm6, %xmm2
-	pmovmskb	%xmm5, %r9d
+	movdqa	32(%rdi), %xmm4
+	salq	$48, %rax
+	pcmpeqb	%xmm7, %xmm4
+	movdqa	16(%rdi), %xmm3
+	pmovmskb %xmm4, %r10d
+	pcmpeqb	%xmm7, %xmm3
+	movdqa	(%rdi), %xmm2
+	pmovmskb %xmm3, %r9d
 	salq	$32, %r10
-	salq	$16, %rax
-	pmovmskb	%xmm2, %r8d
+	pcmpeqb	%xmm7, %xmm2
+	pmovmskb %xmm2, %r8d
 	orq	%r10, %rax
+	salq	$16, %r9
 	orq	%r9, %rax
-	salq	$48, %r8
 	orq	%r8, %rax
 	leaq	-1(%rax), %r8
 	xorq	%rax, %r8
@@ -160,59 +169,50 @@ L(loop_entry):
 	cmovne	%rdi, %rcx
 	cmovne	%rdx, %rsi
 	bsrq	%rsi, %rsi
-	leaq	(%rcx,%rsi), %rax
-	ret
+	leaq	(%rcx, %rsi), %rax
+	ret
 
-	ALIGN(4)
-L(no_c_found):
-	movl	$1, %esi
-	xorl	%ecx, %ecx
-	jmp	L(loop_header)
 
-	ALIGN(4)
-L(exit):
-	xorl	%eax, %eax
-	ret
 
-	ALIGN(4)
+	.p2align 2
 L(cross_page):
-	movq	%rdi, %rax
 	pxor	%xmm0, %xmm0
+	movq	%rdi, %rax
 	andq	$-64, %rax
+	movdqu	48(%rax), %xmm2
 	movdqu	(%rax), %xmm5
-	movdqa	%xmm5, %xmm6
 	movdqu	16(%rax), %xmm4
-	pcmpeqb	%xmm1, %xmm5
+	movdqa	%xmm5, %xmm6
 	pcmpeqb	%xmm0, %xmm6
-	movdqu	32(%rax), %xmm3
-	pmovmskb	%xmm6, %esi
+	pmovmskb %xmm6, %esi
 	movdqa	%xmm4, %xmm6
-	movdqu	48(%rax), %xmm2
-	pcmpeqb	%xmm1, %xmm4
+	movdqu	32(%rax), %xmm3
 	pcmpeqb	%xmm0, %xmm6
-	pmovmskb	%xmm6, %edx
+	pmovmskb %xmm6, %edx
 	movdqa	%xmm3, %xmm6
-	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rdx
 	pcmpeqb	%xmm0, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm6, %r8d
 	pcmpeqb	%xmm2, %xmm0
-	salq	$16, %rdx
-	pmovmskb	%xmm3, %r9d
-	pmovmskb	%xmm6, %r8d
-	pmovmskb	%xmm0, %ecx
-	salq	$32, %r9
+	pmovmskb %xmm0, %ecx
+	pmovmskb %xmm3, %r9d
+	salq	$48, %rcx
 	salq	$32, %r8
-	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm4
 	orq	%r8, %rdx
-	salq	$48, %rcx
-	pmovmskb	%xmm5, %r8d
+	pcmpeqb	%xmm1, %xmm2
 	orq	%rsi, %rdx
-	pmovmskb	%xmm4, %esi
+	pmovmskb %xmm4, %esi
 	orq	%rcx, %rdx
-	pmovmskb	%xmm2, %ecx
+	pmovmskb %xmm2, %ecx
+	pcmpeqb	%xmm1, %xmm5
 	salq	$16, %rsi
-	salq	$48, %rcx
+	pmovmskb %xmm5, %r8d
+	salq	$32, %r9
 	orq	%r9, %rsi
 	orq	%r8, %rsi
+	salq	$48, %rcx
 	orq	%rcx, %rsi
 	movl	%edi, %ecx
 	subl	%eax, %ecx
@@ -226,7 +226,7 @@ L(cross_page):
 	je	L(exit)
 	bsrq	%rsi, %rax
 	addq	%rdi, %rax
-	ret
+	ret
 END (strrchr)
 
 weak_alias (strrchr, rindex)
-- 
1.8.4.rc3

 

