This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH] Optimize strchr more.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 4 Oct 2013 23:10:10 +0200
- Subject: [PATCH] Optimize strchr more.
- Authentication-results: sourceware.org; auth=none
Hi,

I improved strchr in a similar way as strrchr. One difference is that,
since an ifunc is available, we can use pshufb to broadcast the byte.
This gives us a few cycles almost for free.
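As an illustration, here is a minimal C sketch of the two broadcast
strategies using the standard SSE intrinsics (the patch itself does this
in assembly; the function names below are mine):

  #include <emmintrin.h>   /* SSE2 */
  #include <tmmintrin.h>   /* SSSE3 */

  /* SSE2 needs three shuffles to broadcast the low byte of C.  */
  static __m128i
  broadcast_sse2 (int c)
  {
    __m128i x = _mm_cvtsi32_si128 (c);
    x = _mm_unpacklo_epi8 (x, x);      /* punpcklbw */
    x = _mm_unpacklo_epi16 (x, x);     /* punpcklwd */
    return _mm_shuffle_epi32 (x, 0);   /* pshufd $0 */
  }

  /* With SSSE3 a single pshufb with an all-zero index vector
     broadcasts the low byte in one instruction.  */
  static __m128i
  broadcast_ssse3 (int c)
  {
    __m128i x = _mm_cvtsi32_si128 (c);
    return _mm_shuffle_epi8 (x, _mm_setzero_si128 ());  /* pshufb */
  }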
I did the same optimizations as for strrchr, plus one more. The loop
checks four 16-byte blocks for a match and calculates a mask for them.
As x86-64 SSE instructions are destructive, it looked faster to
recalculate one block, since otherwise the extra moves would slow us
down, as here:
  uint64_t m0, m1, m2, m3, mask;
  if (m0 | m1 | m2 | m3)
    {
      /* Recalculate m0 here.  */
      mask = m0 | m1 << 16 | m2 << 32 | m3 << 48;
      ...
But this is not necessary when we are interested only in the first
nonzero bit. We can use the combined mask at the end: it only comes into
play when m0, m1 and m2 are zero, in which case m3 still has the correct
value, as in the following pseudocode:
  uint64_t m0, m1, m2, m3, mask;
  if (m0 | m1 | m2 | m3)
    {
      mask = m0 | m1 << 16 | m2 << 32 | (m0 | m1 | m2 | m3) << 48;
      ...
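To make the trick concrete, here is a small self-contained C sketch (the
block masks would come from pcmpeqb/pmovmskb in the real code; the
helper name is mine):

  #include <stdint.h>

  /* Return the bit position of the first match given four 16-bit
     block masks, or 64 if there is no match.  */
  static int
  first_match (uint64_t m0, uint64_t m1, uint64_t m2, uint64_t m3)
  {
    uint64_t combined = m0 | m1 | m2 | m3;
    if (combined == 0)
      return 64;
    /* If any of m0, m1, m2 is nonzero, the first set bit lies below
       bit 48 and the top word is never looked at.  If all three are
       zero, combined == m3 and the top word is exactly right.  */
    uint64_t mask = m0 | m1 << 16 | m2 << 32 | combined << 48;
    return __builtin_ctzll (mask);  /* like bsfq */
  }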
These modifications give us around a 4% improvement. Benchmarks are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Add strchr-ssse3.
* sysdeps/x86_64/multiarch/strchr-ssse3.S: New file.
* sysdeps/x86_64/multiarch/strchr.S: Add ifunc.
* sysdeps/x86_64/strchr.S: Optimize implementation.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/strchr-ssse3.S | 3 +
sysdeps/x86_64/multiarch/strchr.S | 4 +
sysdeps/x86_64/strchr.S | 228 ++++++++++++++++----------------
4 files changed, 122 insertions(+), 115 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strchr-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f910fb2..042f501 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf strchr-ssse3 memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strchr-ssse3.S b/sysdeps/x86_64/multiarch/strchr-ssse3.S
new file mode 100644
index 0000000..6d09015
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_SSSE3
+#define strchr __strchr_ssse3
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index 3f0b2c5..2b67ca4 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,6 +29,10 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ je 2f
+ leaq __strchr_ssse3(%rip), %rax
+ ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index 1900b37..0bb45d2 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,174 +19,174 @@
#include <sysdep.h>
-# ifndef ALIGN
-# define ALIGN(n) .p2align n
-# endif
-
-
.text
ENTRY (strchr)
movd %esi, %xmm1
movl %edi, %eax
+ pxor %xmm4, %xmm4
andl $4095, %eax
+ pxor %xmm5, %xmm5
+#ifdef USE_SSSE3
+ pshufb %xmm4, %xmm1
+#else
punpcklbw %xmm1, %xmm1
- cmpl $4032, %eax
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
+#endif
+ cmpl $4032, %eax
jg L(cross_page)
movdqu (%rdi), %xmm0
- pxor %xmm3, %xmm3
- movdqa %xmm0, %xmm4
+ pcmpeqb %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
por %xmm4, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
je L(next_48_bytes)
- bsf %eax, %eax
+ bsf %eax, %eax #| bsf %ax, %ax #| bsf %rax, %rax
#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
+ addq %rdi, %rax
#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
+ xor %edx, %edx
+ addq %rdi, %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
- ALIGN(3)
- L(next_48_bytes):
+ .p2align 3
+L(next_48_bytes):
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ movdqu 48(%rdi), %xmm3
+ movdqu 32(%rdi), %xmm2
+ pcmpeqb %xmm3, %xmm7
movdqu 16(%rdi), %xmm0
- movdqa %xmm0, %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
- pmovmskb %xmm0, %ecx
- movdqu 32(%rdi), %xmm0
- movdqa %xmm0, %xmm4
+ movq %rdi, %r8
+ pcmpeqb %xmm1, %xmm3
+ andq $-64, %rdi
+ por %xmm7, %xmm3
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm0, %xmm5
pcmpeqb %xmm1, %xmm0
- salq $16, %rcx
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
+ por %xmm6, %xmm2
+ pmovmskb %xmm3, %edx
+ pmovmskb %xmm2, %ecx
+ shlq $32, %rdx
+ shlq $16, %rcx
+ por %xmm5, %xmm0
pmovmskb %xmm0, %eax
- movdqu 48(%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rcx, %rax
- por %xmm3, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
orq %rcx, %rax
- testq %rax, %rax
- jne L(return)
-L(loop_start):
- /* We use this alignment to force loop be aligned to 8 but not
- 16 bytes. This gives better sheduling on AMD processors. */
- ALIGN(4)
pxor %xmm6, %xmm6
- andq $-64, %rdi
- ALIGN(3)
+ orq %rdx, %rax
+ je L(loop64)
+ bsfq %rax, %rax
+#ifdef AS_STRCHRNUL
+ leaq 16(%r8, %rax), %rax
+#else
+ xor %edx, %edx
+ leaq 16(%r8, %rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
+
+ .p2align 4
L(loop64):
- addq $64, %rdi
- movdqa (%rdi), %xmm5
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
+ movdqa 96(%rdi), %xmm3
+ movdqa 64(%rdi), %xmm5
pxor %xmm1, %xmm5
- movdqa 48(%rdi), %xmm4
- pxor %xmm1, %xmm2
- pxor %xmm1, %xmm3
- pminub (%rdi), %xmm5
+ movdqa 112(%rdi), %xmm4
+ movdqa 80(%rdi), %xmm2
pxor %xmm1, %xmm4
- pminub 16(%rdi), %xmm2
- pminub 32(%rdi), %xmm3
- pminub %xmm2, %xmm5
- pminub 48(%rdi), %xmm4
- pminub %xmm3, %xmm5
- pminub %xmm4, %xmm5
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm5, %eax
-
- testl %eax, %eax
+ pminub 112(%rdi), %xmm4
+ pminub 64(%rdi), %xmm5
+ pxor %xmm1, %xmm3
+ pxor %xmm1, %xmm2
+ pminub %xmm5, %xmm4
+ pminub 96(%rdi), %xmm3
+ pminub 80(%rdi), %xmm2
+ addq $64, %rdi
+ pminub %xmm3, %xmm4
+ pminub %xmm2, %xmm4
+ pcmpeqb %xmm6, %xmm4
+ pmovmskb %xmm4, %edx
+ testl %edx, %edx
je L(loop64)
-
- movdqa (%rdi), %xmm5
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm6, %xmm0
- por %xmm0, %xmm5
- pcmpeqb %xmm6, %xmm2
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm4
-
- pmovmskb %xmm5, %ecx
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %r8d
+ pcmpeqb %xmm6, %xmm2
+ pcmpeqb %xmm6, %xmm5
pmovmskb %xmm2, %eax
+ pmovmskb %xmm5, %ecx
salq $16, %rax
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm4, %edx
salq $32, %r8
+ salq $48, %rdx
orq %r8, %rax
orq %rcx, %rax
- salq $48, %rdx
orq %rdx, %rax
- ALIGN(3)
-L(return):
bsfq %rax, %rax
#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
+ leaq (%rdi, %rax), %rax
#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
+ xor %edx, %edx
+ leaq (%rdi, %rax), %rax
cmpb %sil, (%rax)
cmovne %rdx, %rax
#endif
ret
- ALIGN(4)
+ .p2align 4
L(cross_page):
- movq %rdi, %rdx
- pxor %xmm2, %xmm2
- andq $-64, %rdx
- movdqa %xmm1, %xmm0
- movdqa (%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %r8d
- movdqa 16(%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %eax
- movdqa 32(%rdx), %xmm3
- movdqa %xmm3, %xmm4
+ pxor %xmm7, %xmm7
+ movq %rdi, %rcx
+ andq $-64, %rdi
+ movdqu 48(%rdi), %xmm3
+ movdqu (%rdi), %xmm8
+ movdqu 32(%rdi), %xmm2
+ pxor %xmm6, %xmm6
+ pcmpeqb %xmm8, %xmm4
+ movdqu 16(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm8
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm1, %xmm0
+ por %xmm5, %xmm0
+ por %xmm4, %xmm8
+ pcmpeqb %xmm2, %xmm6
+ pcmpeqb %xmm1, %xmm2
+ por %xmm6, %xmm2
+ pmovmskb %xmm8, %edx
+ pcmpeqb %xmm3, %xmm7
pcmpeqb %xmm1, %xmm3
- salq $16, %rax
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
+ pmovmskb %xmm0, %eax
+ pmovmskb %xmm2, %r8d
+ por %xmm7, %xmm3
pmovmskb %xmm3, %r9d
- movdqa 48(%rdx), %xmm3
- pcmpeqb %xmm3, %xmm2
- salq $32, %r9
- pcmpeqb %xmm3, %xmm0
- orq %r9, %rax
+ shlq $48, %r9
+ shlq $32, %r8
+ shlq $16, %rax
+ orq %rdx, %rax
orq %r8, %rax
- por %xmm2, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rax
- movl %edi, %ecx
- subb %dl, %cl
+ orq %r9, %rax
+ pxor %xmm6, %xmm6
shrq %cl, %rax
testq %rax, %rax
- jne L(return)
- jmp L(loop_start)
-
+ je L(loop64)
+ bsfq %rax, %rax
+#ifdef AS_STRCHRNUL
+ addq %rcx, %rax
+#else
+ xor %edx, %edx
+ addq %rcx, %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
END (strchr)
#ifndef AS_STRCHRNUL
+# ifndef USE_SSSE3
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
+# endif
#endif
--
1.8.4.rc3