This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH v1.1] Improve rawmemchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 2 Sep 2013 11:55:30 +0200
- Subject: [PING][PATCH v1.1] Improve rawmemchr implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20130816120314 dot GA25879 at domone dot kolej dot mff dot cuni dot cz> <20130816121256 dot GA26328 at domone dot kolej dot mff dot cuni dot cz>
Ping,
I noticed that we use strong alias instead of weak, is that intentional?
strong_alias (rawmemchr, __rawmemchr)
There is one minor change, I misclassified case s % 4096 == 4032 as
crossing page boundary. Following should improve performance a bit.
- cmpl $4031, %eax
+ cmpl $4032, %eax
On Fri, Aug 16, 2013 at 02:12:56PM +0200, Ondřej Bílka wrote:
> A patch got accidentally filtered, so here is the correct version.
>
> Hi,
>
> I looked to rawmemchr implementation and it can be improved by using
> similar header that is used in strchr. A loop itself was well optimized
> so we only gain around 20 cycles per call for sizes from 64 bytes.
>
> Results show that this is an improvement in unit tests, but I did not
> find a program that calls rawmemchr yet.
> http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile.html
> A benchmark is at
> http://kam.mff.cuni.cz/~ondra/benchmark_string/rawmemchr_profile160813.tar.bz2
>
> Passes test, OK to commit?
>
* sysdeps/x86_64/rawmemchr.S (rawmemchr): Optimize implementation.
---
sysdeps/x86_64/rawmemchr.S | 252 +++++++++++++++------------------------------
1 file changed, 85 insertions(+), 167 deletions(-)
diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index f4d5591..93b8f4f 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -22,185 +22,103 @@
.text
ENTRY (rawmemchr)
- movd %rsi, %xmm1
- mov %rdi, %rcx
-
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- and $63, %rcx
+ movd %esi, %xmm1
+ movq %rdi, %rax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpl $4032, %eax
+ punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
-
- cmp $48, %rcx
- ja L(crosscache)
-
+ jg L(cross_page)
movdqu (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
- test %eax, %eax
-
- jnz L(matches)
- add $16, %rdi
- and $-16, %rdi
- jmp L(loop_prolog)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- movdqa (%rdi), %xmm0
-
- pcmpeqb %xmm1, %xmm0
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
- sar %cl, %eax
+ pmovmskb %xmm0, %eax
test %eax, %eax
- je L(unaligned_no_match)
-/* Check which byte is a match. */
- bsf %eax, %eax
-
- add %rdi, %rax
- add %rcx, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- add $16, %rdi
-
- .p2align 4
-L(loop_prolog):
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
+ jne L(finish)
+ movdqu 16(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm2
pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
- add $64, %rdi
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- test $0x3f, %rdi
- jz L(align64_loop)
-
- movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- movdqa 16(%rdi), %xmm2
+ movdqu 48(%rdi), %xmm0
pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 48(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
-
- add $64, %rdi
- test %eax, %eax
- jnz L(matches0)
-
- and $-64, %rdi
-
- .p2align 4
-L(align64_loop):
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
+ pmovmskb %xmm3, %edx
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm2, %eax
+ pmovmskb %xmm0, %ecx
+ salq $16, %rdx
+ salq $32, %rax
+ orq %rdx, %rax
+ movq %rcx, %rdx
+ salq $48, %rdx
+ orq %rdx, %rax
+ jne L(finish)
+L(align):
+ andq $-64, %rdi
+ .p2align 4
+L(loop64):
+ movdqa 64(%rdi), %xmm5
+ movdqa 80(%rdi), %xmm4
+ pcmpeqb %xmm1, %xmm5
+ movdqa 96(%rdi), %xmm3
pcmpeqb %xmm1, %xmm4
-
- pmaxub %xmm0, %xmm3
- pmaxub %xmm2, %xmm4
- pmaxub %xmm3, %xmm4
- pmovmskb %xmm4, %eax
-
- add $64, %rdi
-
- test %eax, %eax
- jz L(align64_loop)
-
- sub $64, %rdi
-
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches)
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa 32(%rdi), %xmm3
+ movdqa 112(%rdi), %xmm2
pcmpeqb %xmm1, %xmm3
-
- pcmpeqb 48(%rdi), %xmm1
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- pmovmskb %xmm1, %eax
- bsf %eax, %eax
- lea 48(%rdi, %rax), %rax
- ret
-
- .p2align 4
-L(matches0):
- bsf %eax, %eax
- lea -16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches):
- bsf %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsf %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret
-
- .p2align 4
-L(matches32):
- bsf %eax, %eax
- lea 32(%rax, %rdi), %rax
+ pmaxub %xmm4, %xmm5
+ pcmpeqb %xmm1, %xmm2
+ pmaxub %xmm3, %xmm5
+ pmaxub %xmm2, %xmm5
+ addq $64, %rdi
+ pmovmskb %xmm5, %eax
+ testl %eax, %eax
+ je L(loop64)
+
+ movdqa (%rdi), %xmm5
+ pcmpeqb %xmm1, %xmm5
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm3, %esi
+ pmovmskb %xmm2, %edx
+ salq $32, %rsi
+ salq $16, %rax
+ salq $48, %rdx
+ orq %rsi, %rax
+ orq %rcx, %rax
+ orq %rdx, %rax
+L(finish):
+ bsfq %rax, %rax
+ addq %rdi, %rax
ret
+ .p2align 4,,10
+ .p2align 3
+L(cross_page):
- .p2align 4
-L(return_null):
- xor %rax, %rax
+ movq %rdi, %rax
+ andq $-64, %rax
+ movdqa (%rax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %esi
+ movdqa 16(%rax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqa 32(%rax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ salq $16, %rdx
+ pmovmskb %xmm0, %r8d
+ movdqa 48(%rax), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ salq $32, %r8
+ orq %r8, %rdx
+ pmovmskb %xmm0, %ecx
+ orq %rsi, %rdx
+ salq $48, %rcx
+ orq %rcx, %rdx
+ movl %edi, %ecx
+ subl %eax, %ecx
+ shrq %cl, %rdx
+ testq %rdx, %rdx
+ je L(align)
+ bsfq %rdx, %rax
+ addq %rdi, %rax
ret
-
END (rawmemchr)
strong_alias (rawmemchr, __rawmemchr)