This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1.5/2] Fix strrchr regression.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Tue, 6 Aug 2013 10:34:30 +0200
- Subject: Re: [PATCH 1.5/2] Fix strrchr regression.
- References: <20130805173346 dot GA4978 at domone dot kolej dot mff dot cuni dot cz>
On Mon, Aug 05, 2013 at 07:33:46PM +0200, Ondřej Bílka wrote:
> Hi,
>
...
> To get reliable results I added strrchr to my profiler. You can clearly
> see asymptotic behavior of strrchr here.
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html
>
> Actually there are two regressions here. First one is that strchr_sse42
> is also about 14% asymptotically slower in sandy/ivy bridge and around
> 3% slower for fx10.
>
I added an optimized strrchr version that detects the end of the string
using the trick from strlen, and tested it with my benchmark. It gives another
10% over __strrchr_no_bsf on all architectures except the old Athlon.
I do not yet know how to write an optimal header. It looks like the simpler
header of strchr_sse2 is faster on the result_gcc workload. However, I need
more data, as strrchr is used sparingly there.
Results at
http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html
are updated and benchmark program is here:
http://kam.mff.cuni.cz/~ondra/strrchr_profile060813.tar.bz2
My new variant is the following (ask if you want a template):
#-----------------------------------------------------------------------
# strrchr_new -- SSE2 search for the last occurrence of a byte in a
# zero-terminated byte string. Compiler output (see .ident at the end),
# GAS AT&T syntax, x86-64.
#
# In (as used by the code):
#   rdi = address of the first byte of the string
#   esi = byte to search for (only the low byte is broadcast and compared)
# Out:
#   rax = address of the last byte equal to the search byte located at or
#         before the first zero byte, or 0 if there is no such byte.
#         Searching for byte 0 therefore yields the terminator's address.
# Writes: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm7, flags.
# Uses no stack memory and makes no calls.
#
# Mask convention used throughout: pmovmskb results are combined into
# 64-bit masks with one bit per byte of a 64-byte window (bit i = byte i).
#   "zero mask"  = bytes equal to 0
#   "match mask" = bytes equal to the search byte
#-----------------------------------------------------------------------
.file "strrchr.c"
.text
.p2align 4,,15
.globl strrchr_new
.type strrchr_new, @function
strrchr_new:
.LFB521:
.cfi_startproc
# Broadcast the low byte of esi to all 16 bytes of xmm1 (movd, two unpacks,
# pshufd). The start-offset test is interleaved with the broadcast.
movd %esi, %xmm1
movq %rdi, %rax
# rax = offset of rdi within its 4096-byte span (presumably the page size).
andl $4095, %eax
punpcklbw %xmm1, %xmm1
# Offsets above 4032 mean rdi+63 would lie beyond the current 4096-byte span,
# so the unaligned 64-byte read below is not used; .L2 handles that case.
cmpq $4032, %rax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
ja .L2
# Fast path: look at the first sixteen bytes with an unaligned load.
movdqu (%rdi), %xmm0
# xmm2 = all zero, kept for the zero-byte comparisons here and in .L3.
pxor %xmm2, %xmm2
movdqa %xmm0, %xmm3
pcmpeqb %xmm1, %xmm0
pcmpeqb %xmm2, %xmm3
# ecx = match mask, edx = zero mask for bytes 0..15.
pmovmskb %xmm0, %ecx
pmovmskb %xmm3, %edx
testq %rdx, %rdx
# No zero byte in the first sixteen bytes: continue with bytes 16..63 at .L3.
je .L3
# (rdx - 1) ^ rdx keeps the lowest set bit of rdx and every bit below it,
# i.e. all positions up to and including the first zero byte.
leaq -1(%rdx), %rax
xorq %rdx, %rax
# Keep only matches at or before the first zero byte; none -> return 0.
andq %rcx, %rax
je .L4
# bsr yields the index of the highest set bit = offset of the last match.
#APP
# 47 "strrchr.c" 1
bsrq %rax, %rax
# 0 "" 2
#NO_APP
addq %rdi, %rax
ret
.p2align 4,,10
.p2align 3
# .L2: rdi is within the last 63 bytes of a 4096-byte span. Round rdi down
# to a 64-byte boundary (rcx) and examine that whole 64-byte block, then
# shift the masks so that bytes in front of rdi are dropped.
# Vectors: xmm5 = bytes 0..15, xmm4 = 16..31, xmm3 = 32..47, xmm2 = 48..63.
# Registers are reused while the masks are assembled:
#   zero mask  -> rax = rsi | eax<<16 | r8<<32 | rdx<<48
#   match mask -> rdx = r8  | edx<<16 | r9<<32 | rsi<<48
.L2:
movq %rdi, %rcx
# xmm0 = all zero for the zero-byte comparisons in this block.
pxor %xmm0, %xmm0
andq $-64, %rcx
movdqu (%rcx), %xmm5
movdqa %xmm5, %xmm6
movdqu 16(%rcx), %xmm4
pcmpeqb %xmm1, %xmm5
pcmpeqb %xmm0, %xmm6
movdqu 32(%rcx), %xmm3
# esi = zero mask of bytes 0..15.
pmovmskb %xmm6, %esi
movdqa %xmm4, %xmm6
movdqu 48(%rcx), %xmm2
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm0, %xmm6
# eax = zero mask of bytes 16..31.
pmovmskb %xmm6, %eax
movdqa %xmm3, %xmm6
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm0, %xmm6
# Sign-extends eax into rax; pmovmskb on an XMM register only sets the low
# sixteen bits, so the value is unchanged.
cltq
# xmm0 becomes the zero comparison of bytes 48..63 (xmm0 was all zero).
pcmpeqb %xmm2, %xmm0
salq $16, %rax
# r9d = match mask of bytes 32..47, r8d = zero mask of bytes 32..47,
# edx = zero mask of bytes 48..63.
pmovmskb %xmm3, %r9d
pmovmskb %xmm6, %r8d
pmovmskb %xmm0, %edx
salq $32, %r9
salq $32, %r8
pcmpeqb %xmm1, %xmm2
orq %r8, %rax
salq $48, %rdx
# r8 is now reused for the match mask of bytes 0..15.
pmovmskb %xmm5, %r8d
orq %rsi, %rax
# rsi is now reused for the match mask of bytes 48..63.
pmovmskb %xmm2, %esi
# rax = complete 64-bit zero mask of the block.
orq %rdx, %rax
# rdx is now reused for the match mask of bytes 16..31.
pmovmskb %xmm4, %edx
salq $48, %rsi
salq $16, %rdx
orq %r9, %rdx
orq %r8, %rdx
# rdx = complete 64-bit match mask of the block.
orq %rsi, %rdx
# cl = rdi - block start (0..63). Shifting both masks right by that amount
# makes bit 0 correspond to the byte at rdi and discards earlier bytes.
movl %edi, %esi
subl %ecx, %esi
movl %esi, %ecx
shrq %cl, %rax
shrq %cl, %rdx
testq %rax, %rax
# No zero byte from rdi to the end of this block: go to the main loop setup.
je .L6
# Restrict matches to positions at or before the first zero byte.
leaq -1(%rax), %rcx
xorq %rax, %rcx
andq %rcx, %rdx
je .L4
# .L33: shared exit. rdx is a non-zero match mask whose bit 0 corresponds to
# the byte at rdi; return rdi + index of its highest set bit.
.L33:
#APP
# 47 "strrchr.c" 1
bsrq %rdx, %rdx
# 0 "" 2
#NO_APP
leaq (%rdi,%rdx), %rax
ret
.p2align 4,,10
.p2align 3
# .L6: the first window (from .L2 or .L3) held no zero byte. rdx is its match
# mask relative to rdi. Set rax to the last match seen so far (via .L35) or
# to 0, then enter the aligned loop.
.L6:
xorl %eax, %eax
testq %rdx, %rdx
jne .L35
# .L8: move rdi to the next 64-byte boundary above its current value. When
# arriving from .L3 (unaligned rdi) the bytes between the new rdi and the end
# of the already examined window are examined a second time by the loop.
.L8:
addq $64, %rdi
# xmm7 = all zero, the loop's comparison operand for the pminub result.
pxor %xmm7, %xmm7
andq $-64, %rdi
jmp .L13
.p2align 4,,10
.p2align 3
# .L11: loop tail for a block without a zero byte. If the block contains a
# match, remember the address of its last match in rax.
.L11:
testq %rdx, %rdx
je .L12
#APP
# 47 "strrchr.c" 1
bsrq %rdx, %rdx
# 0 "" 2
#NO_APP
leaq (%rdi,%rdx), %rax
.L12:
addq $64, %rdi
# .L13: loop head. rdi is 64-byte aligned here, so the aligned loads
# (movdqa) are valid. Vectors: xmm5 = bytes 0..15, xmm4 = 16..31,
# xmm3 = 32..47, xmm2 = 48..63.
.L13:
movdqa 32(%rdi), %xmm3
# xmm6 = all zero, used for the per-vector zero masks once a zero byte has
# been detected in this block.
pxor %xmm6, %xmm6
movdqa 48(%rdi), %xmm2
movdqa %xmm3, %xmm0
movdqa 16(%rdi), %xmm4
# Byte-wise unsigned minimum of the four vectors: a lane of the result is 0
# exactly when at least one of the four vectors has a 0 in that lane.
pminub %xmm2, %xmm0
movdqa (%rdi), %xmm5
pminub %xmm4, %xmm0
pminub %xmm5, %xmm0
pcmpeqb %xmm7, %xmm0
# ecx != 0 exactly when the 64-byte block contains a zero byte.
pmovmskb %xmm0, %ecx
# Build the 64-bit match mask in rdx = r8 | edx<<16 | r9<<32 | rsi<<48.
movdqa %xmm5, %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %r8d
movdqa %xmm4, %xmm0
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %edx
movdqa %xmm3, %xmm0
pcmpeqb %xmm1, %xmm0
salq $16, %rdx
pmovmskb %xmm0, %r9d
movdqa %xmm2, %xmm0
pcmpeqb %xmm1, %xmm0
salq $32, %r9
orq %r9, %rdx
pmovmskb %xmm0, %esi
orq %r8, %rdx
salq $48, %rsi
orq %rsi, %rdx
testl %ecx, %ecx
je .L11
# A zero byte is in this block: compute the exact 64-bit zero mask in
# rcx = r8 | ecx<<16 | r9<<32 | rsi<<48 (the vectors are overwritten by
# their comparison results, they are not needed afterwards).
pcmpeqb %xmm6, %xmm4
pcmpeqb %xmm6, %xmm3
pcmpeqb %xmm6, %xmm5
pmovmskb %xmm4, %ecx
pmovmskb %xmm3, %r9d
pcmpeqb %xmm6, %xmm2
pmovmskb %xmm5, %r8d
salq $16, %rcx
salq $32, %r9
pmovmskb %xmm2, %esi
orq %r9, %rcx
orq %r8, %rcx
salq $48, %rsi
orq %rsi, %rcx
# Keep only matches at or before the first zero byte of this block.
leaq -1(%rcx), %rsi
xorq %rcx, %rsi
andq %rsi, %rdx
# A match in the final block wins; otherwise return rax unchanged, which is
# the last match remembered from earlier blocks or 0 if there was none.
jne .L33
# rep-prefixed ret exactly as emitted by the compiler.
rep
ret
.p2align 4,,10
.p2align 3
# .L3: continuation of the fast path. Bytes 0..15 at rdi contain no zero
# byte; rcx still holds their match mask and xmm2 is still all zero.
# Examine bytes 16..63 from rdi with unaligned loads.
# Vectors: xmm4 = bytes 16..31, xmm3 = 32..47, xmm0 = 48..63.
#   zero mask  -> rax = edx<<16 | eax<<32 | rsi<<48
#   match mask -> rdx = rcx | edx<<16 | r8<<32 | ecx<<48
.L3:
movdqu 16(%rdi), %xmm4
movdqa %xmm4, %xmm5
movdqu 32(%rdi), %xmm3
pcmpeqb %xmm1, %xmm4
pcmpeqb %xmm2, %xmm5
movdqu 48(%rdi), %xmm0
# edx = zero mask of bytes 16..31.
pmovmskb %xmm5, %edx
movdqa %xmm3, %xmm5
pcmpeqb %xmm1, %xmm3
pcmpeqb %xmm2, %xmm5
salq $16, %rdx
# r8d = match mask of bytes 32..47, eax = zero mask of bytes 32..47.
pmovmskb %xmm3, %r8d
pmovmskb %xmm5, %eax
# xmm2 becomes the zero comparison of bytes 48..63 (xmm2 was all zero).
pcmpeqb %xmm0, %xmm2
salq $32, %r8
salq $32, %rax
pcmpeqb %xmm1, %xmm0
orq %rdx, %rax
# rdx is now reused for the match mask of bytes 16..31.
pmovmskb %xmm4, %edx
pmovmskb %xmm2, %esi
salq $48, %rsi
salq $16, %rdx
orq %r8, %rdx
# Merge the match mask of bytes 0..15 computed before the jump to .L3.
orq %rcx, %rdx
# rcx is now reused for the match mask of bytes 48..63.
pmovmskb %xmm0, %ecx
salq $48, %rcx
orq %rcx, %rdx
# This final OR completes the zero mask and sets ZF for the branch below.
orq %rsi, %rax
je .L6
# Restrict matches to positions at or before the first zero byte.
leaq -1(%rax), %rcx
xorq %rax, %rcx
andq %rcx, %rdx
jne .L36
.p2align 4,,10
.p2align 3
# .L4: the terminator was found and no match lies at or before it; return 0.
.L4:
xorl %eax, %eax
ret
.p2align 4,,10
.p2align 3
# .L35: helper for .L6. Record rdi + index of the highest match bit in rax,
# then rejoin the loop setup at .L8.
.L35:
#APP
# 47 "strrchr.c" 1
bsrq %rdx, %rax
# 0 "" 2
#NO_APP
addq %rdi, %rax
jmp .L8
.p2align 4,,10
.p2align 3
# .L36: exit for the .L3 path. Same result as .L33 (rdi + index of the
# highest match bit), computed directly into rax.
.L36:
#APP
# 47 "strrchr.c" 1
bsrq %rdx, %rax
# 0 "" 2
#NO_APP
addq %rdi, %rax
ret
.cfi_endproc
.LFE521:
.size strrchr_new, .-strrchr_new
.ident "GCC: (Debian 4.7.1-2) 4.7.1"
.section .note.GNU-stack,"",@progbits