This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH] Optimize strchr more.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 4 Oct 2013 23:10:10 +0200
- Subject: [PATCH] Optimize strchr more.
- Authentication-results: sourceware.org; auth=none
Hi,

I improved strchr in a similar way as strrchr. One difference is that,
since an ifunc is available, we can use pshufb to broadcast the byte.
This gives us a few cycles almost for free.
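As an illustration, here is a minimal C sketch of the two broadcast
strategies using the standard SSE intrinsics (the patch itself does this
in assembly; the function names below are mine):

  #include <emmintrin.h>   /* SSE2 */
  #include <tmmintrin.h>   /* SSSE3 */

  /* SSE2 needs three shuffles to broadcast the low byte of C.  */
  static __m128i
  broadcast_sse2 (int c)
  {
    __m128i x = _mm_cvtsi32_si128 (c);
    x = _mm_unpacklo_epi8 (x, x);      /* punpcklbw */
    x = _mm_unpacklo_epi16 (x, x);     /* punpcklwd */
    return _mm_shuffle_epi32 (x, 0);   /* pshufd $0 */
  }

  /* With SSSE3 a single pshufb with an all-zero index vector
     broadcasts the low byte in one instruction.  */
  static __m128i
  broadcast_ssse3 (int c)
  {
    __m128i x = _mm_cvtsi32_si128 (c);
    return _mm_shuffle_epi8 (x, _mm_setzero_si128 ());  /* pshufb */
  }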
I did the same optimizations as for strrchr, plus one more. The loop
checks four 16-byte blocks for a match and calculates a mask for them.
As x86-64 SSE instructions are destructive, it looked faster to
recalculate one block, since otherwise the extra moves would slow us
down, as here:
  uint64_t m0, m1, m2, m3, mask;
  if (m0 | m1 | m2 | m3)
    {
      /* Recalculate m0 here.  */
      mask = m0 | m1 << 16 | m2 << 32 | m3 << 48;
      ...
But this is not necessary when we are interested only in the first
nonzero bit. We can use the combined mask at the end: it only comes into
play when m0, m1 and m2 are zero, in which case m3 still has the correct
value, as in the following pseudocode:
  uint64_t m0, m1, m2, m3, mask;
  if (m0 | m1 | m2 | m3)
    {
      mask = m0 | m1 << 16 | m2 << 32 | (m0 | m1 | m2 | m3) << 48;
      ...
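To make the trick concrete, here is a small self-contained C sketch (the
block masks would come from pcmpeqb/pmovmskb in the real code; the
helper name is mine):

  #include <stdint.h>

  /* Return the bit position of the first match given four 16-bit
     block masks, or 64 if there is no match.  */
  static int
  first_match (uint64_t m0, uint64_t m1, uint64_t m2, uint64_t m3)
  {
    uint64_t combined = m0 | m1 | m2 | m3;
    if (combined == 0)
      return 64;
    /* If any of m0, m1, m2 is nonzero, the first set bit lies below
       bit 48 and the top word is never looked at.  If all three are
       zero, combined == m3 and the top word is exactly right.  */
    uint64_t mask = m0 | m1 << 16 | m2 << 32 | combined << 48;
    return __builtin_ctzll (mask);  /* like bsfq */
  }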
These modifications give us around a 4% improvement. Benchmarks are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Add strchr-ssse3.
* sysdeps/x86_64/multiarch/strchr-ssse3.S: New file.
* sysdeps/x86_64/multiarch/strchr.S: Add ifunc.
* sysdeps/x86_64/strchr.S: Optimize implementation.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/strchr-ssse3.S | 3 +
sysdeps/x86_64/multiarch/strchr.S | 4 +
sysdeps/x86_64/strchr.S | 228 ++++++++++++++++----------------
4 files changed, 122 insertions(+), 115 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strchr-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f910fb2..042f501 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf strchr-ssse3 memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strchr-ssse3.S b/sysdeps/x86_64/multiarch/strchr-ssse3.S
new file mode 100644
index 0000000..6d09015
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_SSSE3
+#define strchr __strchr_ssse3
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index 3f0b2c5..2b67ca4 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,6 +29,10 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ je 2f
+ leaq __strchr_ssse3(%rip), %rax
+ ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index 1900b37..0bb45d2 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,174 +19,174 @@
#include <sysdep.h>
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
-
.text
ENTRY (strchr)
movd %esi, %xmm1
movl %edi, %eax
+ pxor %xmm4, %xmm4
andl $4095, %eax
+ pxor %xmm5, %xmm5
+#ifdef USE_SSSE3
+ pshufb %xmm4, %xmm1
+#else
punpcklbw %xmm1, %xmm1
- cmpl $4032, %eax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
+#endif
+ cmpl $4032, %eax
jg L(cross_page)
movdqu (%rdi), %xmm0
- pxor %xmm3, %xmm3
- movdqa %xmm0, %xmm4
+ pcmpeqb %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
je L(next_48_bytes)
- bsf %eax, %eax
+ bsf %eax, %eax #| bsf %ax, %ax #| bsf %rax, %rax
#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
+ addq %rdi, %rax
#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
+ xor %edx, %edx
+ addq %rdi, %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
- ALIGN(3)
- L(next_48_bytes):
+ .p2align 3
+L(next_48_bytes):
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ movdqu 48(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm2
+ pcmpeqb %xmm3, %xmm7
movdqu 16(%rdi), %xmm0
- movdqa %xmm0, %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
- pmovmskb %xmm0, %ecx
- movdqu 32(%rdi), %xmm0
- movdqa %xmm0, %xmm4
+ movq %rdi, %r8
+ pcmpeqb %xmm1, %xmm3
+ andq $-64, %rdi
+ por %xmm7, %xmm3
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm0, %xmm5
pcmpeqb %xmm1, %xmm0
- salq $16, %rcx
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
+ por %xmm6, %xmm2
+ pmovmskb %xmm3, %edx
+ pmovmskb %xmm2, %ecx
+ shlq $32, %rdx
+ shlq $16, %rcx
+ por %xmm5, %xmm0
pmovmskb %xmm0, %eax
- movdqu 48(%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rcx, %rax
- por %xmm3, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
orq %rcx, %rax
- testq %rax, %rax
- jne L(return)
-L(loop_start):
- /* We use this alignment to force loop be aligned to 8 but not
- 16 bytes. This gives better sheduling on AMD processors. */
- ALIGN(4)
pxor %xmm6, %xmm6
- andq $-64, %rdi
- ALIGN(3)
+ orq %rdx, %rax
+ je L(loop64)
+ bsfq %rax, %rax
+#ifdef AS_STRCHRNUL
+ leaq 16(%r8, %rax), %rax
+#else
+ xor %edx, %edx
+ leaq 16(%r8, %rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
+
+ .p2align 4
L(loop64):
- addq $64, %rdi
- movdqa (%rdi), %xmm5
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
+ movdqa 96(%rdi), %xmm3
+ movdqa 64(%rdi), %xmm5
pxor %xmm1, %xmm5
- movdqa 48(%rdi), %xmm4
- pxor %xmm1, %xmm2
- pxor %xmm1, %xmm3
- pminub (%rdi), %xmm5
+ movdqa 112(%rdi), %xmm4
+ movdqa 80(%rdi), %xmm2
pxor %xmm1, %xmm4
- pminub 16(%rdi), %xmm2
- pminub 32(%rdi), %xmm3
- pminub %xmm2, %xmm5
- pminub 48(%rdi), %xmm4
- pminub %xmm3, %xmm5
- pminub %xmm4, %xmm5
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm5, %eax
-
- testl %eax, %eax
+ pminub 112(%rdi), %xmm4
+ pminub 64(%rdi), %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm1, %xmm2
+ pminub %xmm5, %xmm4
+ pminub 96(%rdi), %xmm3
+ pminub 80(%rdi), %xmm2
+ addq $64, %rdi
+ pminub %xmm3, %xmm4
+ pminub %xmm2, %xmm4
+ pcmpeqb %xmm6, %xmm4
+ pmovmskb %xmm4, %edx
+ testl %edx, %edx
je L(loop64)
-
- movdqa (%rdi), %xmm5
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm6, %xmm0
- por %xmm0, %xmm5
- pcmpeqb %xmm6, %xmm2
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm4
-
- pmovmskb %xmm5, %ecx
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %r8d
+ pcmpeqb %xmm6, %xmm2
+ pcmpeqb %xmm6, %xmm5
pmovmskb %xmm2, %eax
+ pmovmskb %xmm5, %ecx
salq $16, %rax
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm4, %edx
salq $32, %r8
+ salq $48, %rdx
orq %r8, %rax
orq %rcx, %rax
- salq $48, %rdx
orq %rdx, %rax
- ALIGN(3)
-L(return):
bsfq %rax, %rax
#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
+ leaq (%rdi, %rax), %rax
#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
+ xor %edx, %edx
+ leaq (%rdi, %rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
- ALIGN(4)
+ .p2align 4
L(cross_page):
- movq %rdi, %rdx
- pxor %xmm2, %xmm2
- andq $-64, %rdx
- movdqa %xmm1, %xmm0
- movdqa (%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %r8d
- movdqa 16(%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %eax
- movdqa 32(%rdx), %xmm3
- movdqa %xmm3, %xmm4
+ pxor %xmm7, %xmm7
+ movq %rdi, %rcx
+ andq $-64, %rdi
+ movdqu 48(%rdi), %xmm3
+ movdqu (%rdi), %xmm8
+ movdqu 32(%rdi), %xmm2
+ pxor %xmm6, %xmm6
+ pcmpeqb %xmm8, %xmm4
+ movdqu 16(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm8
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm1, %xmm0
+ por %xmm5, %xmm0
+ por %xmm4, %xmm8
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm1, %xmm2
+ por %xmm6, %xmm2
+ pmovmskb %xmm8, %edx
+ pcmpeqb %xmm3, %xmm7
pcmpeqb %xmm1, %xmm3
- salq $16, %rax
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
+ pmovmskb %xmm0, %eax
+ pmovmskb %xmm2, %r8d
+ por %xmm7, %xmm3
pmovmskb %xmm3, %r9d
- movdqa 48(%rdx), %xmm3
- pcmpeqb %xmm3, %xmm2
- salq $32, %r9
- pcmpeqb %xmm3, %xmm0
- orq %r9, %rax
+ shlq $48, %r9
+ shlq $32, %r8
+ shlq $16, %rax
+ orq %rdx, %rax
orq %r8, %rax
- por %xmm2, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rax
- movl %edi, %ecx
- subb %dl, %cl
+ orq %r9, %rax
+ pxor %xmm6, %xmm6
shrq %cl, %rax
testq %rax, %rax
- jne L(return)
- jmp L(loop_start)
-
+ je L(loop64)
+ bsfq %rax, %rax
+#ifdef AS_STRCHRNUL
+ addq %rcx, %rax
+#else
+ xor %edx, %edx
+ addq %rcx, %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
END (strchr)
#ifndef AS_STRCHRNUL
+# ifndef USE_SSSE3
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
+# endif
#endif
--
1.8.4.rc3