This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH] Optimize SSE 4.1 x86_64 memcmp
- From: Florian Weimer <fweimer at redhat dot com>
- To: "Carlos O'Donell" <carlos at redhat dot com>, GNU C Library <libc-alpha at sourceware dot org>
- Cc: Ondřej Bílka <neleai at seznam dot cz>
- Date: Tue, 04 Feb 2014 13:33:34 +0100
- Subject: Re: [PATCH] Optimize SSE 4.1 x86_64 memcmp
- Authentication-results: sourceware.org; auth=none
- References: <52EBBCC2 dot 7090807 at redhat dot com> <52EBC480 dot 4000509 at redhat dot com> <52EF6E9E dot 8050708 at redhat dot com> <52F00A7D dot 5080008 at redhat dot com> <52F0DD76 dot 30900 at redhat dot com>
On 02/04/2014 01:30 PM, Florian Weimer wrote:
On 02/03/2014 10:30 PM, Carlos O'Donell wrote:
Unaligned loads overlapping with data previously compared as equal.
This data cannot affect the outcome of the final comparison. (This
trick was already used before, but it's not entirely obvious.)
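A minimal C sketch of that trick (illustrative code, not the glibc implementation; a little-endian host is assumed): any length n between 8 and 16 is covered by two 8-byte loads, where the second load re-reads up to 16 - n bytes that the first comparison already proved equal.

#include <stdint.h>
#include <string.h>

/* Sketch only: compare 8 <= n <= 16 bytes with two 8-byte loads.
   The second load starts at n - 8 and overlaps the first by 16 - n
   bytes; the overlapping bytes are already known to be equal, so
   re-reading them cannot change the result.  memcpy stands in for
   the unaligned MOVs.  */
static int
cmp_8_to_16 (const unsigned char *p, const unsigned char *q, size_t n)
{
  uint64_t a, b;
  memcpy (&a, p, 8);
  memcpy (&b, q, 8);
  if (a == b)
    {
      memcpy (&a, p + n - 8, 8);   /* overlapping load */
      memcpy (&b, q + n - 8, 8);
      if (a == b)
        return 0;
    }
  /* Order the first differing word as diffin8bytes does: byte-swap
     to big-endian, then an unsigned compare matches memcmp.  */
  return __builtin_bswap64 (a) < __builtin_bswap64 (b) ? -1 : 1;
}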
The SETNE/CMOVAE sequence for the branchless sign function.
(It turns out that this part is not actually present anymore.)
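(The patch still computes a branchless sign, just with a CMP/SBB/OR sequence in diffin8bytes. Roughly, in C, for two words already known to differ:)

#include <stdint.h>

/* Rough C equivalent of the CMP/SBB/OR idiom in diffin8bytes: for
   a != b, yields -1 if a < b and 1 if a > b, with no branch.  */
static int
branchless_sign (uint64_t a, uint64_t b)
{
  int r = -(a < b);   /* SBB %eax, %eax: -1 on borrow, else 0 */
  return r | 1;       /* OR $1: turns 0 into 1, preserves -1 */
}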
No, that looks good from a commenting perspective.
I have incorporated the comment updates.
And here's the patch.
--
Florian Weimer / Red Hat Product Security Team
2014-02-04 Florian Weimer <fweimer@redhat.com>
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Vectorize the comparison
of differing bytes. Correct spelling. Add documentation.
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index e753d62..10b4b69 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -36,9 +36,29 @@
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
+ memcmp has to use UNSIGNED comparison for elements.
*/
+/* This implementation uses special cases for lengths up to 79.
+ In most of them, we compute the ordering of the differing bytes
+ with a big-endian comparison in diffin8bytes and diffin4bytes.
+ This comparison does not depend on the location of the differing
+ byte in the word.
+
+ If possible, overlapping 4-byte or 8-byte loads are used to
+ minimize small-size loads and comparisons. The bytes in the
+ overlapping part are known to be equal and thus do not affect the
+ outcome of the final comparison.
+
+ In the remaining cases, comments indicate how the bytes in the
+ input are loaded into the registers. Bytes are labeled 1, 2, ...
+ in the order of increasing addresses, starting from the current
+ read position (which is %rdi/%rsi minus the number of remaining
+ bytes). The resulting register contents are presented from most
+ significant byte to least. Zero bytes are indicated with a dot
+ ".". For example, a regular 4-byte load of the bytes 1234 will
+ result in 64-bit register contents of ...4321. */
+
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
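As a worked example of the labeling convention above: in the new L(5bytes) case further down, bytes 12345 end up in a register as .54321, and one 64-bit compare plus diffin8bytes orders them. A C sketch of the same construction (illustrative only; little-endian host assumed):

#include <stdint.h>
#include <string.h>

/* Sketch of the L(5bytes) construction: a 4-byte load yields ...4321,
   the fifth byte is shifted into bits 32-39, giving .54321.  After a
   byte swap, the unsigned comparison does not depend on where the
   differing byte sits in the word.  */
static int
cmp5 (const unsigned char *p, const unsigned char *q)
{
  uint32_t lo_p, lo_q;
  memcpy (&lo_p, p, 4);
  memcpy (&lo_q, q, 4);
  uint64_t a = lo_p | ((uint64_t) p[4] << 32);
  uint64_t b = lo_q | ((uint64_t) q[4] << 32);
  if (a == b)
    return 0;
  return __builtin_bswap64 (a) < __builtin_bswap64 (b) ? -1 : 1;
}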
@@ -419,7 +439,7 @@ L(less128bytesin2aligned):
ptest %xmm2, %xmm0
jnc L(64bytesin256)
cmp $32, %rdx
- jb L(less32bytesin64in2alinged)
+ jb L(less32bytesin64in2aligned)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
@@ -433,7 +453,7 @@ L(less128bytesin2aligned):
sub $32, %rdx
add $32, %rdi
add $32, %rsi
-L(less32bytesin64in2alinged):
+L(less32bytesin64in2aligned):
add $64, %rdi
add $64, %rsi
add %rdx, %rsi
@@ -446,7 +466,7 @@ L(128bytesormorein2aligned):
ja L(512bytesormorein2aligned)
cmp $256, %rdx
ja L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
+L(less256bytesin2aligned):
sub $128, %rdx
movdqa (%rdi), %xmm2
@@ -603,13 +623,13 @@ L(256bytesormorein2aligned):
add $256, %rdi
cmp $128, %rdx
- jae L(less256bytesin2alinged)
+ jae L(less256bytesin2aligned)
cmp $64, %rdx
jae L(less128bytesin2aligned)
cmp $32, %rdx
- jb L(less32bytesin256in2alinged)
+ jb L(less32bytesin256in2aligned)
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
@@ -623,7 +643,7 @@ L(256bytesormorein2aligned):
sub $32, %rdx
add $32, %rdi
add $32, %rsi
-L(less32bytesin256in2alinged):
+L(less32bytesin256in2aligned):
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
@@ -866,6 +886,7 @@ L(13bytes):
mov -13(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -875,13 +896,18 @@ L(13bytes):
.p2align 4
L(5bytes):
+ /* Bytes 12345 loaded as ...54321. */
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
+ movzbl -1(%rdi), %edi
+ movzbl -1(%rsi), %esi
+ shl $32, %rdi
+ shl $32, %rsi
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -916,12 +942,17 @@ L(10bytes):
mov -10(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
+L(2bytes):
+ /* Bytes 12 loaded as ..12. */
+ movzbl -2(%rdi), %eax
+ movzbl -2(%rsi), %ecx
+ shl $8, %eax
+ movzbl -1(%rdi), %edi
+ shl $8, %ecx
+ movzbl -1(%rsi), %esi
+ or %edi, %eax
+ or %esi, %ecx
+ /* No borrow possible because the inputs are only 16 bits. */
sub %ecx, %eax
ret
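For the "no borrow" remark just above: both operands are big-endian 16-bit quantities, so the 32-bit subtraction cannot wrap and its sign is already the memcmp result. A C sketch (illustrative only):

/* Sketch of the 2-byte case: a and b are in 0..65535, so a - b lies
   in [-65535, 65535]; no overflow, and no SBB/OR fixup is needed.  */
static int
cmp2 (const unsigned char *p, const unsigned char *q)
{
  int a = (p[0] << 8) | p[1];   /* big-endian 16-bit value */
  int b = (q[0] << 8) | q[1];
  return a - b;
}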
@@ -931,6 +962,7 @@ L(14bytes):
mov -14(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -940,18 +972,18 @@ L(14bytes):
.p2align 4
L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
+ /* Bytes 123456 loaded as ..654321. */
movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
+ movzwl -2(%rsi), %ecx
+ mov -6(%rdi), %edi
+ mov -6(%rsi), %esi
+ shl $32, %rax
+ shl $32, %rcx
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -986,6 +1018,7 @@ L(11bytes):
mov -11(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1012,6 +1045,7 @@ L(7bytes):
mov -7(%rsi), %ecx
cmp %eax, %ecx
jne L(diffin4bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1021,10 +1055,22 @@ L(7bytes):
.p2align 4
L(3bytes):
+ /* Bytes 123 loaded as .123. */
movzwl -3(%rdi), %eax
+ bswap %eax
movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
+ movzbl -1(%rdi), %edi
+ bswap %ecx
+ shr $8, %eax
+ movzbl -1(%rsi), %esi
+ shr $8, %ecx
+ or %edi, %eax
+ or %esi, %ecx
+ /* No borrow possible because the inputs are only 24 bits. */
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
L(1bytes):
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
@@ -1104,6 +1150,7 @@ L(21bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1140,6 +1187,7 @@ L(22bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1176,6 +1224,7 @@ L(23bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1296,7 +1345,10 @@ L(26bytes):
jne L(diffin8bytes)
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
.p2align 4
L(75bytes):
@@ -1331,6 +1383,7 @@ L(27bytes):
mov -11(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1413,12 +1466,11 @@ L(29bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
-
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
-
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1459,6 +1511,7 @@ L(30bytes):
mov -14(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1499,6 +1552,7 @@ L(31bytes):
mov -15(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1542,7 +1596,7 @@ L(32bytes):
ret
/*
- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+ * Aligned to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
*/
.p2align 3
L(less16bytes):
@@ -1553,43 +1607,49 @@ L(less16bytes):
jne L(diffin8bytes)
mov 8(%rsi, %rdx), %rcx
mov 8(%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
L(diffin8bytes):
+# ifndef USE_AS_WMEMCMP
+/* Expects left word (LHS) in %rax, right word (RHS) in %rcx, and they
+ must be different. Converts both words to big-endian and returns
+ the result of their comparison. */
+ bswap %rcx
+ bswap %rax
+ cmp %rcx, %rax
+ sbb %eax, %eax /* Set to -1 if LHS < RHS, otherwise 0. */
+ or $1, %eax /* Turn 0 into 1, but preserve -1. */
+ ret
+
+ .p2align 4
+L(diffin4bytes):
+/* Expects left word (LHS) in %eax, right word (RHS) in %ecx, and they
+ must be different. Converts both words to big-endian and returns
+ the result of their comparison. 32-bit bswap is faster, otherwise
+ we could use the 64-bit version. */
+ bswap %ecx
+ bswap %eax
+ cmp %ecx, %eax
+ sbb %eax, %eax /* Set to -1 if LHS < RHS, otherwise 0. */
+ or $1, %eax /* Turn 0 into 1, but preserve -1. */
+ ret
+
+# else
+/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
-# endif
-
L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax