This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH v1.1] Optimize strrchr more.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 5 Oct 2013 08:39:32 +0200
- Subject: [PATCH v1.1] Optimize strrchr more.
- Authentication-results: sourceware.org; auth=none
- References: <20131004201522 dot GA6269 at domone>
On Fri, Oct 04, 2013 at 10:15:22PM +0200, Ondřej Bílka wrote:
> Hi,
>
> I played with my evolutionary algorithms to optimize various functions.
> It helped a bit, but I gained more by looking at the code again, because
> I noticed several opportunities that I had missed.
>
> The first is that the best way to test whether bytes are zero is to
> prepare a zero register in advance. Assume you have this code:
>
> pxor %xmm3, %xmm3
> movdqa %xmm1, %xmm2
> pcmpeqb %xmm3, %xmm1 /*get zero mask */
> # do something with xmm2
>
> This could be changed into:
>
> pxor %xmm1, %xmm1
> pcmpeqb %xmm2, %xmm1 /*get zero mask */
> # do something with xmm2
>
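
To make that concrete, here is the same idea rendered with SSE2 intrinsics
(a rough sketch for illustration, not code from the patch): pcmpeqb
overwrites its destination, so pre-zeroing a scratch register and comparing
into it keeps the input value live without the extra movdqa.

#include <emmintrin.h>  /* SSE2 intrinsics */

/* Before: copy the input first, so the original value survives
   the destructive pcmpeqb (pxor + movdqa + pcmpeqb).  */
static inline int
zero_mask_with_copy (__m128i x)
{
  __m128i zero = _mm_setzero_si128 ();   /* pxor              */
  __m128i copy = x;                      /* movdqa            */
  copy = _mm_cmpeq_epi8 (copy, zero);    /* pcmpeqb           */
  return _mm_movemask_epi8 (copy);       /* pmovmskb          */
}

/* After: zero the destination and compare it against the input
   directly; x is untouched and the copy disappears.  */
static inline int
zero_mask_no_copy (__m128i x)
{
  __m128i mask = _mm_setzero_si128 ();   /* pxor              */
  mask = _mm_cmpeq_epi8 (mask, x);       /* pcmpeqb           */
  return _mm_movemask_epi8 (mask);       /* pmovmskb          */
}
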
> The second improvement is using 32-bit registers where beneficial.
>
> And last, in the previous iteration I did not thoroughly check the loop
> generated by gcc. A CSE pass added four extra moves that are useful only
> after exiting the loop. By removing these we gained around 10% on most
> architectures for large inputs.
>
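
The loop this produces has roughly the following shape, sketched in C
intrinsics with a hypothetical helper (not the glibc code): only the running
match mask is carried across iterations, pminub folds the four vectors into
a single zero test, and anything needed only after the NUL is found is
computed outside the loop.

#include <emmintrin.h>
#include <stdint.h>

/* Sketch of the hot loop over aligned 64-byte blocks.  Carries only
   the last non-empty match mask and its base across iterations.  */
static const char *
scan_blocks (const char *p, __m128i needle,
             uint64_t *last_mask, const char **last_base,
             uint64_t *final_mask)
{
  const __m128i zero = _mm_setzero_si128 ();
  for (;; p += 64)
    {
      __m128i v0 = _mm_load_si128 ((const __m128i *) (p + 0));
      __m128i v1 = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i v2 = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i v3 = _mm_load_si128 ((const __m128i *) (p + 48));

      /* pminub: the byte-wise minimum of all four vectors has a zero
         byte iff any of the 64 input bytes is zero, so a single
         pcmpeqb/pmovmskb decides whether to keep looping.  */
      __m128i min = _mm_min_epu8 (_mm_min_epu8 (v0, v1),
                                  _mm_min_epu8 (v2, v3));
      int zero_seen = _mm_movemask_epi8 (_mm_cmpeq_epi8 (min, zero));

      /* 64-bit mask of needle matches across the whole block.  */
      uint64_t m =
          (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, needle))
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, needle)) << 16
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, needle)) << 32
        | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, needle)) << 48;

      if (zero_seen != 0)
        {
          /* Exit-only work lives here, not in the loop body.  */
          *final_mask = m;
          return p;
        }
      if (m != 0)
        {
          *last_mask = m;   /* the only state carried by the loop */
          *last_base = p;
        }
    }
}
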
> Then there is the evolver itself, which rearranges the scheduling into a
> faster one, but its effect is relatively small (1%-2%).
>
> OK to commit?
>
I introduced a typo during manual editing: I tested the wrong register at
the exit from the loop. Here is a fixed version.
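
The exit path then picks the last occurrence out of these masks with the
lea/xor/bsr sequence visible in the diff below; as a small C sketch
(illustration only, with __builtin_clzll standing in for bsr):

#include <stdint.h>

/* Given the 64-bit needle-match mask (cmask) and NUL mask (zmask)
   for a block starting at base, return the last match that is not
   past the terminator, or NULL.  */
static const char *
last_match_before_nul (const char *base, uint64_t cmask, uint64_t zmask)
{
  /* (zmask - 1) ^ zmask keeps the bits up to and including the
     first NUL, so strrchr (s, '\0') still finds the terminator.  */
  cmask &= (zmask - 1) ^ zmask;
  if (cmask == 0)
    return NULL;
  /* Highest set bit = last occurrence; bsr in the assembly.  */
  return base + 63 - __builtin_clzll (cmask);
}
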
* sysdeps/x86_64/strrchr.S: Optimize implementation.
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 514765b..d532206 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -26,61 +26,72 @@
.text
ENTRY (strrchr)
+ pxor %xmm6, %xmm6
movd %esi, %xmm1
+ pxor %xmm7, %xmm7
movq %rdi, %rax
+#ifdef USE_SSSE3
andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
+ pxor %xmm5, %xmm5
+ pxor %xmm4, %xmm4
+ cmp $4032, %eax
+ pshufb %xmm7, %xmm1
+#else
+ punpcklbw %xmm1, %xmm1
+ andl $4095, %eax
+ pxor %xmm5, %xmm5
+ pxor %xmm4, %xmm4
+ cmp $4032, %eax
+ punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
+#endif
ja L(cross_page)
movdqu (%rdi), %xmm0
- pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
+ pmovmskb %xmm0, %ecx
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
+ lea -1(%edx), %eax
+ xor %edx, %eax
+ and %ecx, %eax
je L(exit)
- bsrq %rax, %rax
+ bsr %eax, %eax
addq %rdi, %rax
- ret
+ ret
+
+L(exit):
+ xorl %eax, %eax
+ ret
- ALIGN(4)
+ .p2align 4
L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
+ movdqu 48(%rdi), %xmm4
+ pcmpeqb %xmm4, %xmm7
+ movdqu 16(%rdi), %xmm2
pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
+ pmovmskb %xmm5, %eax
+ pcmpeqb %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %esi
+ salq $16, %rsi
pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %edx
+ orq %rcx, %rsi
+ pmovmskb %xmm6, %ecx
+ pmovmskb %xmm3, %r8d
+ salq $16, %rax
+ salq $32, %rcx
salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
salq $48, %rdx
- salq $16, %rsi
+ orq %rcx, %rax
orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
+ orq %rdx, %rsi
+ pmovmskb %xmm7, %edx
+ salq $48, %rdx
orq %rdx, %rax
je L(loop_header2)
leaq -1(%rax), %rcx
@@ -88,71 +99,69 @@ L(next_48_bytes):
andq %rcx, %rsi
je L(exit)
bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
- ret
+ leaq (%rdi, %rsi), %rax
+ ret
- ALIGN(4)
+ .p2align 3
L(loop_header2):
testq %rsi, %rsi
movq %rdi, %rcx
- je L(no_c_found)
+ jne L(loop_header)
+ movl $1, %esi /* Evaluates to null. */
+ xorl %ecx, %ecx
L(loop_header):
addq $64, %rdi
pxor %xmm7, %xmm7
andq $-64, %rdi
jmp L(loop_entry)
- ALIGN(4)
+ .p2align 3
L(loop64):
testq %rdx, %rdx
cmovne %rdx, %rsi
cmovne %rdi, %rcx
addq $64, %rdi
L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
+ movdqa 16(%rdi), %xmm3
+ movdqa 48(%rdi), %xmm5
+ movdqa (%rdi), %xmm2
+ movdqa %xmm2, %xmm6
+ pminub %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ movdqa 32(%rdi), %xmm4
+ pminub %xmm4, %xmm6
+ pminub %xmm5, %xmm6
+ pmovmskb %xmm3, %edx
+ pcmpeqb %xmm7, %xmm6
+ pmovmskb %xmm6, %eax
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %r10d
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm5, %r8d
salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
salq $32, %r10
orq %r10, %rdx
- pmovmskb %xmm0, %r8d
+ pmovmskb %xmm2, %r9d
orq %r9, %rdx
salq $48, %r8
orq %r8, %rdx
testl %eax, %eax
je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
+ movdqa 32(%rdi), %xmm4
+ salq $48, %rax
+ pcmpeqb %xmm7, %xmm4
+ movdqa 16(%rdi), %xmm3
+ pmovmskb %xmm4, %r10d
+ pcmpeqb %xmm7, %xmm3
+ movdqa (%rdi), %xmm2
+ pmovmskb %xmm3, %r9d
salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
+ pcmpeqb %xmm7, %xmm2
+ pmovmskb %xmm2, %r8d
orq %r10, %rax
+ salq $16, %r9
orq %r9, %rax
- salq $48, %r8
orq %r8, %rax
leaq -1(%rax), %r8
xorq %rax, %r8
@@ -160,59 +169,50 @@ L(loop_entry):
cmovne %rdi, %rcx
cmovne %rdx, %rsi
bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
- ret
+ leaq (%rcx, %rsi), %rax
+ ret
- ALIGN(4)
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
- ALIGN(4)
-L(exit):
- xorl %eax, %eax
- ret
- ALIGN(4)
+ .p2align 2
L(cross_page):
- movq %rdi, %rax
pxor %xmm0, %xmm0
+ movq %rdi, %rax
andq $-64, %rax
+ movdqu 48(%rax), %xmm2
movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
+ movdqa %xmm5, %xmm6
pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
+ pmovmskb %xmm6, %esi
movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
+ movdqu 32(%rax), %xmm3
pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
+ pmovmskb %xmm6, %edx
movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
+ salq $16, %rdx
pcmpeqb %xmm0, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm6, %r8d
pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
+ pmovmskb %xmm0, %ecx
+ pmovmskb %xmm3, %r9d
+ salq $48, %rcx
salq $32, %r8
- pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm4
orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
+ pcmpeqb %xmm1, %xmm2
orq %rsi, %rdx
- pmovmskb %xmm4, %esi
+ pmovmskb %xmm4, %esi
orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
+ pmovmskb %xmm2, %ecx
+ pcmpeqb %xmm1, %xmm5
salq $16, %rsi
- salq $48, %rcx
+ pmovmskb %xmm5, %r8d
+ salq $32, %r9
orq %r9, %rsi
orq %r8, %rsi
+ salq $48, %rcx
orq %rcx, %rsi
movl %edi, %ecx
subl %eax, %ecx
@@ -226,7 +226,7 @@ L(cross_page):
je L(exit)
bsrq %rsi, %rax
addq %rdi, %rax
- ret
+ ret
END (strrchr)
weak_alias (strrchr, rindex)
--
1.8.4.rc3