This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PING][PATCH v1.1] Optimize strrchr more.


ping
On Sat, Oct 05, 2013 at 08:39:32AM +0200, Ondřej Bílka wrote:
> On Fri, Oct 04, 2013 at 10:15:22PM +0200, Ondřej Bílka wrote:
> > Hi,
> > 
> > I played with my evolutionary algorithms to optimize various functions.
> > It helped a bit, but I gained more by looking at the code again, because
> > I noticed several opportunities that I had missed.
> > 
> > The first is that the best way to test whether bytes are zero is to
> > prepare a zero register in advance. Assume you have the code
> > 
> > pxor %xmm3, %xmm3
> > movdqa %xmm1, %xmm2
> > pcmpeqb %xmm3, %xmm1 /*get zero mask */
> > # do something with xmm2
> > 
> > This can be changed into
> > 
> > pxor %xmm1, %xmm1
> > pcmpeqb %xmm2, %xmm1 /*get zero mask */
> > # do something with xmm2
> > 
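> > Since pcmpeqb destroys its destination, comparing into the pre-zeroed
> > register keeps the data register intact and saves the movdqa copy.
> > Concretely, this is how the patch tests the first 16 bytes (a sketch
> > extracted from the code below, slightly simplified):
> > 
> > 	pxor	%xmm4, %xmm4		/* zero register, prepared early */
> > 	movdqu	(%rdi), %xmm0		/* load 16 bytes of the string */
> > 	pcmpeqb	%xmm0, %xmm4		/* 0xff in each zero byte; %xmm0 kept */
> > 	pmovmskb %xmm4, %edx		/* 16-bit mask of the zero bytes */
> > 	test	%edx, %edx		/* any zero byte in this block? */
> > 	je	L(next_48_bytes)
> > 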
> > The second improvement is using 32-bit registers where beneficial; a
> > 32-bit operation zero-extends its result to 64 bits, and dropping the
> > REX.W prefix makes the encoding shorter.
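> > 
> > For example (encoding bytes per the instruction-set manual; the flags
> > are the same here because the mask fits in 16 bits):
> > 
> > 	testq	%rdx, %rdx	/* 48 85 d2: 3 bytes */
> > 	test	%edx, %edx	/* 85 d2:    2 bytes */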
> > 
> > And last, in the previous iteration I did not thoroughly check the loop
> > generated by gcc. A CSE pass added four extra moves that are useful only
> > after exiting the loop. By removing these we gained around 10% on most
> > architectures for large inputs.
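> > 
> > A schematic of the problem (not the actual gcc output):
> > 
> > L(loop):
> > 	movdqa	(%rdi), %xmm0
> > 	movdqa	%xmm0, %xmm5	/* copy whose value is consumed only */
> > 				/* after the loop exits */
> > 	pcmpeqb	%xmm1, %xmm0	/* the in-loop use destroys %xmm0 */
> > 	# rest of the iteration
> > 	jne	L(loop)
> > 
> > Dropping such copies and recomputing the needed masks from memory once
> > the loop has exited takes the moves off every iteration.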
> > 
> > Then there is the evolver itself, which rearranges the scheduling into
> > a faster one, but its effect is relatively small (1%-2%).
> > 
> > OK to commit?
> > 
> I introduced a typo during manual editing: I tested the wrong register
> at the exit from the loop.
> 
> Here is the fixed version.
> 
>  	* sysdeps/x86_64/strrchr.S: Optimize implementation.
> 
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 514765b..d532206 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -26,61 +26,72 @@
>  
>  	.text
>  ENTRY (strrchr)
> +	pxor	%xmm6, %xmm6
>  	movd	%esi, %xmm1
> +	pxor	%xmm7, %xmm7
>  	movq	%rdi, %rax
> +#ifdef USE_SSSE3
>  	andl	$4095, %eax
> -	punpcklbw	%xmm1, %xmm1
> -	cmpq	$4032, %rax
> -	punpcklwd	%xmm1, %xmm1
> +	pxor	%xmm5, %xmm5
> +	pxor	%xmm4, %xmm4
> +	cmp	$4032, %eax
> +	pshufb	%xmm7, %xmm1
> +#else
> +	punpcklbw %xmm1, %xmm1
> +	andl	$4095, %eax
> +	pxor	%xmm5, %xmm5
> +	pxor	%xmm4, %xmm4
> +	cmp	$4032, %eax
> +	punpcklwd %xmm1, %xmm1
>  	pshufd	$0, %xmm1, %xmm1
> +#endif
>  	ja	L(cross_page)
>  	movdqu	(%rdi), %xmm0
> -	pxor	%xmm2, %xmm2
> -	movdqa	%xmm0, %xmm3
> +	pcmpeqb	%xmm0, %xmm4
>  	pcmpeqb	%xmm1, %xmm0
> -	pcmpeqb	%xmm2, %xmm3
> -	pmovmskb	%xmm0, %ecx
> -	pmovmskb	%xmm3, %edx
> -	testq	%rdx, %rdx
> +	pmovmskb %xmm0, %ecx
> +	pmovmskb %xmm4, %edx
> +	test	%edx, %edx
>  	je	L(next_48_bytes)
> -	leaq	-1(%rdx), %rax
> -	xorq	%rdx, %rax
> -	andq	%rcx, %rax
> +	lea	-1(%edx), %eax
> +	xor	%edx, %eax
> +	and	%ecx, %eax
>  	je	L(exit)
> -	bsrq	%rax, %rax
> +	bsr	%eax, %eax
>  	addq	%rdi, %rax
> -	ret
> +	ret
> +
> +L(exit):
> +	xorl	%eax, %eax
> +	ret
>  
> -	ALIGN(4)
> +	.p2align 4
>  L(next_48_bytes):
> -	movdqu	16(%rdi), %xmm4
> -	movdqa	%xmm4, %xmm5
>  	movdqu	32(%rdi), %xmm3
> -	pcmpeqb	%xmm1, %xmm4
> +	movdqu	48(%rdi), %xmm4
> +	pcmpeqb	%xmm4, %xmm7
> +	movdqu	16(%rdi), %xmm2
>  	pcmpeqb	%xmm2, %xmm5
> -	movdqu	48(%rdi), %xmm0
> -	pmovmskb	%xmm5, %edx
> -	movdqa	%xmm3, %xmm5
> +	pmovmskb %xmm5, %eax
> +	pcmpeqb	%xmm3, %xmm6
> +	pcmpeqb	%xmm1, %xmm2
> +	pmovmskb %xmm2, %esi
> +	salq	$16, %rsi
>  	pcmpeqb	%xmm1, %xmm3
> -	pcmpeqb	%xmm2, %xmm5
> -	pcmpeqb	%xmm0, %xmm2
> -	salq	$16, %rdx
> -	pmovmskb	%xmm3, %r8d
> -	pmovmskb	%xmm5, %eax
> -	pmovmskb	%xmm2, %esi
> +	pcmpeqb	%xmm1, %xmm4
> +	pmovmskb %xmm4, %edx
> +	orq	%rcx, %rsi
> +	pmovmskb %xmm6, %ecx
> +	pmovmskb %xmm3, %r8d
> +	salq	$16, %rax
> +	salq	$32, %rcx
>  	salq	$32, %r8
> -	salq	$32, %rax
> -	pcmpeqb	%xmm1, %xmm0
> -	orq	%rdx, %rax
> -	movq	%rsi, %rdx
> -	pmovmskb	%xmm4, %esi
>  	salq	$48, %rdx
> -	salq	$16, %rsi
> +	orq	%rcx, %rax
>  	orq	%r8, %rsi
> -	orq	%rcx, %rsi
> -	pmovmskb	%xmm0, %ecx
> -	salq	$48, %rcx
> -	orq	%rcx, %rsi
> +	orq	%rdx, %rsi
> +	pmovmskb %xmm7, %edx
> +	salq	$48, %rdx
>  	orq	%rdx, %rax
>  	je	L(loop_header2)
>  	leaq	-1(%rax), %rcx
> @@ -88,71 +99,69 @@ L(next_48_bytes):
>  	andq	%rcx, %rsi
>  	je	L(exit)
>  	bsrq	%rsi, %rsi
> -	leaq	(%rdi,%rsi), %rax
> -	ret
> +	leaq	(%rdi, %rsi), %rax
> +	ret
>  
> -	ALIGN(4)
> +	.p2align 3
>  L(loop_header2):
>  	testq	%rsi, %rsi
>  	movq	%rdi, %rcx
> -	je	L(no_c_found)
> +	jne	L(loop_header)
> +	movl	$1, %esi /* Evaluates to null.  */
> +	xorl	%ecx, %ecx
>  L(loop_header):
>  	addq	$64, %rdi
>  	pxor	%xmm7, %xmm7
>  	andq	$-64, %rdi
>  	jmp	L(loop_entry)
>  
> -	ALIGN(4)
> +	.p2align 3
>  L(loop64):
>  	testq	%rdx, %rdx
>  	cmovne	%rdx, %rsi
>  	cmovne	%rdi, %rcx
>  	addq	$64, %rdi
>  L(loop_entry):
> -	movdqa	32(%rdi), %xmm3
> -	pxor	%xmm6, %xmm6
> -	movdqa	48(%rdi), %xmm2
> -	movdqa	%xmm3, %xmm0
> -	movdqa	16(%rdi), %xmm4
> -	pminub	%xmm2, %xmm0
> -	movdqa	(%rdi), %xmm5
> -	pminub	%xmm4, %xmm0
> -	pminub	%xmm5, %xmm0
> -	pcmpeqb	%xmm7, %xmm0
> -	pmovmskb	%xmm0, %eax
> -	movdqa	%xmm5, %xmm0
> -	pcmpeqb	%xmm1, %xmm0
> -	pmovmskb	%xmm0, %r9d
> -	movdqa	%xmm4, %xmm0
> -	pcmpeqb	%xmm1, %xmm0
> -	pmovmskb	%xmm0, %edx
> -	movdqa	%xmm3, %xmm0
> -	pcmpeqb	%xmm1, %xmm0
> +	movdqa	16(%rdi), %xmm3
> +	movdqa	48(%rdi), %xmm5
> +	movdqa	(%rdi), %xmm2
> +	movdqa	%xmm2, %xmm6
> +	pminub	%xmm3, %xmm6
> +	pcmpeqb	%xmm1, %xmm3
> +	movdqa	32(%rdi), %xmm4
> +	pminub	%xmm4, %xmm6
> +	pminub	%xmm5, %xmm6
> +	pmovmskb %xmm3, %edx
> +	pcmpeqb	%xmm7, %xmm6
> +	pmovmskb %xmm6, %eax
> +	pcmpeqb	%xmm1, %xmm4
> +	pmovmskb %xmm4, %r10d
> +	pcmpeqb	%xmm1, %xmm5
> +	pcmpeqb	%xmm1, %xmm2
> +	pmovmskb %xmm5, %r8d
>  	salq	$16, %rdx
> -	pmovmskb	%xmm0, %r10d
> -	movdqa	%xmm2, %xmm0
> -	pcmpeqb	%xmm1, %xmm0
>  	salq	$32, %r10
>  	orq	%r10, %rdx
> -	pmovmskb	%xmm0, %r8d
> +	pmovmskb %xmm2, %r9d
>  	orq	%r9, %rdx
>  	salq	$48, %r8
>  	orq	%r8, %rdx
>  	testl	%eax, %eax
>  	je	L(loop64)
> -	pcmpeqb	%xmm6, %xmm4
> -	pcmpeqb	%xmm6, %xmm3
> -	pcmpeqb	%xmm6, %xmm5
> -	pmovmskb	%xmm4, %eax
> -	pmovmskb	%xmm3, %r10d
> -	pcmpeqb	%xmm6, %xmm2
> -	pmovmskb	%xmm5, %r9d
> +	movdqa	32(%rdi), %xmm4
> +	salq	$48, %rax
> +	pcmpeqb	%xmm7, %xmm4
> +	movdqa	16(%rdi), %xmm3
> +	pmovmskb %xmm4, %r10d
> +	pcmpeqb	%xmm7, %xmm3
> +	movdqa	(%rdi), %xmm2
> +	pmovmskb %xmm3, %r9d
>  	salq	$32, %r10
> -	salq	$16, %rax
> -	pmovmskb	%xmm2, %r8d
> +	pcmpeqb	%xmm7, %xmm2
> +	pmovmskb %xmm2, %r8d
>  	orq	%r10, %rax
> +	salq	$16, %r9
>  	orq	%r9, %rax
> -	salq	$48, %r8
>  	orq	%r8, %rax
>  	leaq	-1(%rax), %r8
>  	xorq	%rax, %r8
> @@ -160,59 +169,50 @@ L(loop_entry):
>  	cmovne	%rdi, %rcx
>  	cmovne	%rdx, %rsi
>  	bsrq	%rsi, %rsi
> -	leaq	(%rcx,%rsi), %rax
> -	ret
> +	leaq	(%rcx, %rsi), %rax
> +	ret
>  
> -	ALIGN(4)
> -L(no_c_found):
> -	movl	$1, %esi
> -	xorl	%ecx, %ecx
> -	jmp	L(loop_header)
>  
> -	ALIGN(4)
> -L(exit):
> -	xorl	%eax, %eax
> -	ret
>  
> -	ALIGN(4)
> +	.p2align 2
>  L(cross_page):
> -	movq	%rdi, %rax
>  	pxor	%xmm0, %xmm0
> +	movq	%rdi, %rax
>  	andq	$-64, %rax
> +	movdqu	48(%rax), %xmm2
>  	movdqu	(%rax), %xmm5
> -	movdqa	%xmm5, %xmm6
>  	movdqu	16(%rax), %xmm4
> -	pcmpeqb	%xmm1, %xmm5
> +	movdqa	%xmm5, %xmm6
>  	pcmpeqb	%xmm0, %xmm6
> -	movdqu	32(%rax), %xmm3
> -	pmovmskb	%xmm6, %esi
> +	pmovmskb %xmm6, %esi
>  	movdqa	%xmm4, %xmm6
> -	movdqu	48(%rax), %xmm2
> -	pcmpeqb	%xmm1, %xmm4
> +	movdqu	32(%rax), %xmm3
>  	pcmpeqb	%xmm0, %xmm6
> -	pmovmskb	%xmm6, %edx
> +	pmovmskb %xmm6, %edx
>  	movdqa	%xmm3, %xmm6
> -	pcmpeqb	%xmm1, %xmm3
> +	salq	$16, %rdx
>  	pcmpeqb	%xmm0, %xmm6
> +	pcmpeqb	%xmm1, %xmm3
> +	pmovmskb %xmm6, %r8d
>  	pcmpeqb	%xmm2, %xmm0
> -	salq	$16, %rdx
> -	pmovmskb	%xmm3, %r9d
> -	pmovmskb	%xmm6, %r8d
> -	pmovmskb	%xmm0, %ecx
> -	salq	$32, %r9
> +	pmovmskb %xmm0, %ecx
> +	pmovmskb %xmm3, %r9d
> +	salq	$48, %rcx
>  	salq	$32, %r8
> -	pcmpeqb	%xmm1, %xmm2
> +	pcmpeqb	%xmm1, %xmm4
>  	orq	%r8, %rdx
> -	salq	$48, %rcx
> -	pmovmskb	%xmm5, %r8d
> +	pcmpeqb	%xmm1, %xmm2
>  	orq	%rsi, %rdx
> -	pmovmskb	%xmm4, %esi
> +	pmovmskb %xmm4, %esi
>  	orq	%rcx, %rdx
> -	pmovmskb	%xmm2, %ecx
> +	pmovmskb %xmm2, %ecx
> +	pcmpeqb	%xmm1, %xmm5
>  	salq	$16, %rsi
> -	salq	$48, %rcx
> +	pmovmskb %xmm5, %r8d
> +	salq	$32, %r9
>  	orq	%r9, %rsi
>  	orq	%r8, %rsi
> +	salq	$48, %rcx
>  	orq	%rcx, %rsi
>  	movl	%edi, %ecx
>  	subl	%eax, %ecx
> @@ -226,7 +226,7 @@ L(cross_page):
>  	je	L(exit)
>  	bsrq	%rsi, %rax
>  	addq	%rdi, %rax
> -	ret
> +	ret
>  END (strrchr)
>  
>  weak_alias (strrchr, rindex)
> -- 
> 1.8.4.rc3
> 
>  

-- 

broadcast packets on wrong frequency

