This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH] Optimize SSE 4.1 x86_64 memcmp
- From: Florian Weimer <fweimer at redhat dot com>
- To: "Carlos O'Donell" <carlos at redhat dot com>, GNU C Library <libc-alpha at sourceware dot org>
- Cc: Ondřej Bílka <neleai at seznam dot cz>
- Date: Tue, 04 Feb 2014 13:33:34 +0100
- Subject: Re: [PATCH] Optimize SSE 4.1 x86_64 memcmp
- Authentication-results: sourceware.org; auth=none
- References: <52EBBCC2 dot 7090807 at redhat dot com> <52EBC480 dot 4000509 at redhat dot com> <52EF6E9E dot 8050708 at redhat dot com> <52F00A7D dot 5080008 at redhat dot com> <52F0DD76 dot 30900 at redhat dot com>
On 02/04/2014 01:30 PM, Florian Weimer wrote:
On 02/03/2014 10:30 PM, Carlos O'Donell wrote:
Unaligned loads overlapping with data previously compared as equal.
This data cannot affect the outcome of the final comparison. (This
trick was already used before, but it's not entirely obvious.)
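A minimal C sketch of that trick (illustrative code, not the glibc implementation; a little-endian host is assumed): any length n between 8 and 16 is covered by two 8-byte loads, where the second load re-reads up to 16 - n bytes that the first comparison already proved equal.

#include <stdint.h>
#include <string.h>

/* Sketch only: compare 8 <= n <= 16 bytes with two 8-byte loads.
   The second load starts at n - 8 and overlaps the first by 16 - n
   bytes; the overlapping bytes are already known to be equal, so
   re-reading them cannot change the result.  memcpy stands in for
   the unaligned MOVs.  */
static int
cmp_8_to_16 (const unsigned char *p, const unsigned char *q, size_t n)
{
  uint64_t a, b;
  memcpy (&a, p, 8);
  memcpy (&b, q, 8);
  if (a == b)
    {
      memcpy (&a, p + n - 8, 8);   /* overlapping load */
      memcpy (&b, q + n - 8, 8);
      if (a == b)
        return 0;
    }
  /* Order the first differing word as diffin8bytes does: byte-swap
     to big-endian, then an unsigned compare matches memcmp.  */
  return __builtin_bswap64 (a) < __builtin_bswap64 (b) ? -1 : 1;
}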
The SETNE/CMOVAE sequence for the branchless sign function.
(It turns out that this part is not actually present anymore.)
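(The patch still computes a branchless sign, just with a CMP/SBB/OR sequence in diffin8bytes. Roughly, in C, for two words already known to differ:)

#include <stdint.h>

/* Rough C equivalent of the CMP/SBB/OR idiom in diffin8bytes: for
   a != b, yields -1 if a < b and 1 if a > b, with no branch.  */
static int
branchless_sign (uint64_t a, uint64_t b)
{
  int r = -(a < b);   /* SBB %eax, %eax: -1 on borrow, else 0 */
  return r | 1;       /* OR $1: turns 0 into 1, preserves -1 */
}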
No, that looks good from a commenting perspective.
I have incorporated the comment updates.
And here's the patch.
--
Florian Weimer / Red Hat Product Security Team
2014-02-04 Florian Weimer <fweimer@redhat.com>
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Vectorize the comparison
of differing bytes. Correct spelling. Add documentation.
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index e753d62..10b4b69 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -36,9 +36,29 @@
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
+ memcmp has to use UNSIGNED comparison for elements.
*/
+/* This implementation uses special cases for lengths up to 79.
+ In most of them, we compute the ordering of the differing bytes
+ with a big-endian comparison in diffin8bytes and diffin4bytes.
+ This comparison does not depend on the location of the differing
+ byte in the word.
+
+ If possible, overlapping 4-byte or 8-byte loads are used to
+ minimize small-size loads and comparisons. The bytes in the
+ overlapping part are known to be equal and thus do not affect the
+ outcome of the final comparison.
+
+ In the remaining cases, comments indicate how the bytes in the
+ input are loaded into the registers. Bytes are labeled 1, 2, ...
+ in the order of increasing addresses, starting from the current
+ read position (which is %rdi/%rsi minus the number of remaining
+ bytes). The resulting register contents are presented from most
+ significant byte to least. Zero bytes are indicated with a dot
+ ".". For example, a regular 4-byte load of the bytes 1234 will
+ result in 64-bit register contents of ...4321. */
+
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
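As a worked example of the labeling convention above: in the new L(5bytes) case further down, bytes 12345 end up in a register as .54321, and one 64-bit compare plus diffin8bytes orders them. A C sketch of the same construction (illustrative only; little-endian host assumed):

#include <stdint.h>
#include <string.h>

/* Sketch of the L(5bytes) construction: a 4-byte load yields ...4321,
   the fifth byte is shifted into bits 32-39, giving .54321.  After a
   byte swap, the unsigned comparison does not depend on where the
   differing byte sits in the word.  */
static int
cmp5 (const unsigned char *p, const unsigned char *q)
{
  uint32_t lo_p, lo_q;
  memcpy (&lo_p, p, 4);
  memcpy (&lo_q, q, 4);
  uint64_t a = lo_p | ((uint64_t) p[4] << 32);
  uint64_t b = lo_q | ((uint64_t) q[4] << 32);
  if (a == b)
    return 0;
  return __builtin_bswap64 (a) < __builtin_bswap64 (b) ? -1 : 1;
}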
@@ -419,7 +439,7 @@ L(less128bytesin2aligned):
ptest %xmm2, %xmm0
jnc L(64bytesin256)
cmp $32, %rdx
- jb L(less32bytesin64in2alinged)
+ jb L(less32bytesin64in2aligned)
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
@@ -433,7 +453,7 @@ L(less128bytesin2aligned):
sub $32, %rdx
add $32, %rdi
add $32, %rsi
-L(less32bytesin64in2alinged):
+L(less32bytesin64in2aligned):
add $64, %rdi
add $64, %rsi
add %rdx, %rsi
@@ -446,7 +466,7 @@ L(128bytesormorein2aligned):
ja L(512bytesormorein2aligned)
cmp $256, %rdx
ja L(256bytesormorein2aligned)
-L(less256bytesin2alinged):
+L(less256bytesin2aligned):
sub $128, %rdx
movdqa (%rdi), %xmm2
@@ -603,13 +623,13 @@ L(256bytesormorein2aligned):
add $256, %rdi
cmp $128, %rdx
- jae L(less256bytesin2alinged)
+ jae L(less256bytesin2aligned)
cmp $64, %rdx
jae L(less128bytesin2aligned)
cmp $32, %rdx
- jb L(less32bytesin256in2alinged)
+ jb L(less32bytesin256in2aligned)
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
@@ -623,7 +643,7 @@ L(256bytesormorein2aligned):
sub $32, %rdx
add $32, %rdi
add $32, %rsi
-L(less32bytesin256in2alinged):
+L(less32bytesin256in2aligned):
add %rdx, %rsi
add %rdx, %rdi
BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
@@ -866,6 +886,7 @@ L(13bytes):
mov -13(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -875,13 +896,18 @@ L(13bytes):
.p2align 4
L(5bytes):
+ /* Bytes 12345 loaded as ...54321. */
mov -5(%rdi), %eax
mov -5(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
- movzbl -1(%rdi), %eax
- movzbl -1(%rsi), %edx
- sub %edx, %eax
+ movzbl -1(%rdi), %edi
+ movzbl -1(%rsi), %esi
+ shl $32, %rdi
+ shl $32, %rsi
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -916,12 +942,17 @@ L(10bytes):
mov -10(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
+L(2bytes):
+ /* Bytes 12 loaded as ..12. */
+ movzbl -2(%rdi), %eax
+ movzbl -2(%rsi), %ecx
+ shl $8, %eax
+ movzbl -1(%rdi), %edi
+ shl $8, %ecx
+ movzbl -1(%rsi), %esi
+ or %edi, %eax
+ or %esi, %ecx
+ /* No borrow possible because the inputs are only 16 bits. */
sub %ecx, %eax
ret
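For the "no borrow" remark just above: both operands are big-endian 16-bit quantities, so the 32-bit subtraction cannot wrap and its sign is already the memcmp result. A C sketch (illustrative only):

/* Sketch of the 2-byte case: a and b are in 0..65535, so a - b lies
   in [-65535, 65535]; no overflow, and no SBB/OR fixup is needed.  */
static int
cmp2 (const unsigned char *p, const unsigned char *q)
{
  int a = (p[0] << 8) | p[1];   /* big-endian 16-bit value */
  int b = (q[0] << 8) | q[1];
  return a - b;
}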
@@ -931,6 +962,7 @@ L(14bytes):
mov -14(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -940,18 +972,18 @@ L(14bytes):
.p2align 4
L(6bytes):
- mov -6(%rdi), %eax
- mov -6(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin4bytes)
-L(2bytes):
- movzwl -2(%rsi), %ecx
+ /* Bytes 123456 loaded as ..654321. */
movzwl -2(%rdi), %eax
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
+ movzwl -2(%rsi), %ecx
+ mov -6(%rdi), %edi
+ mov -6(%rsi), %esi
+ shl $32, %rax
+ shl $32, %rcx
+ or %rdi, %rax
+ or %rsi, %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
ret
.p2align 4
@@ -986,6 +1018,7 @@ L(11bytes):
mov -11(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1012,6 +1045,7 @@ L(7bytes):
mov -7(%rsi), %ecx
cmp %eax, %ecx
jne L(diffin4bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1021,10 +1055,22 @@ L(7bytes):
.p2align 4
L(3bytes):
+ /* Bytes 123 loaded as .123. */
movzwl -3(%rdi), %eax
+ bswap %eax
movzwl -3(%rsi), %ecx
- cmp %eax, %ecx
- jne L(diffin2bytes)
+ movzbl -1(%rdi), %edi
+ bswap %ecx
+ shr $8, %eax
+ movzbl -1(%rsi), %esi
+ shr $8, %ecx
+ or %edi, %eax
+ or %esi, %ecx
+ /* No borrow possible because the inputs are only 24 bits. */
+ sub %ecx, %eax
+ ret
+
+ .p2align 4
L(1bytes):
movzbl -1(%rdi), %eax
movzbl -1(%rsi), %ecx
@@ -1104,6 +1150,7 @@ L(21bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1140,6 +1187,7 @@ L(22bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1176,6 +1224,7 @@ L(23bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1296,7 +1345,10 @@ L(26bytes):
jne L(diffin8bytes)
movzwl -2(%rdi), %eax
movzwl -2(%rsi), %ecx
- jmp L(diffin2bytes)
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
.p2align 4
L(75bytes):
@@ -1331,6 +1383,7 @@ L(27bytes):
mov -11(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -4(%rdi), %eax
mov -4(%rsi), %ecx
cmp %eax, %ecx
@@ -1413,12 +1466,11 @@ L(29bytes):
pxor %xmm1, %xmm2
ptest %xmm2, %xmm0
jnc L(less16bytes)
-
mov -13(%rdi), %rax
mov -13(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
-
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1459,6 +1511,7 @@ L(30bytes):
mov -14(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1499,6 +1552,7 @@ L(31bytes):
mov -15(%rsi), %rcx
cmp %rax, %rcx
jne L(diffin8bytes)
+ /* Overlapping loads, reprocessing bytes already known to be equal. */
mov -8(%rdi), %rax
mov -8(%rsi), %rcx
cmp %rax, %rcx
@@ -1542,7 +1596,7 @@ L(32bytes):
ret
/*
- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block.
+ * Aligned to 8 bytes to avoid two taken branches in one 16-byte aligned code block.
*/
.p2align 3
L(less16bytes):
@@ -1553,43 +1607,49 @@ L(less16bytes):
jne L(diffin8bytes)
mov 8(%rsi, %rdx), %rcx
mov 8(%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
L(diffin8bytes):
+# ifndef USE_AS_WMEMCMP
+/* Expects left word (LHS) in %rax, right word (RHS) in %rcx, and they
+ must be different. Converts both words to big-endian and returns
+ the result of their comparison. */
+ bswap %rcx
+ bswap %rax
+ cmp %rcx, %rax
+ sbb %eax, %eax /* Set to -1 if LHS < RHS, otherwise 0. */
+ or $1, %eax /* Turn 0 into 1, but preserve -1. */
+ ret
+
+ .p2align 4
+L(diffin4bytes):
+/* Expects left word (LHS) in %eax, right word (RHS) in %ecx, and they
+ must be different. Converts both words to big-endian and returns
+ the result of their comparison. 32-bit bswap is faster, otherwise
+ we could use the 64-bit version. */
+ bswap %ecx
+ bswap %eax
+ cmp %ecx, %eax
+ sbb %eax, %eax /* Set to -1 if LHS < RHS, otherwise 0. */
+ or $1, %eax /* Turn 0 into 1, but preserve -1. */
+ ret
+
+# else
+/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
shr $32, %rcx
shr $32, %rax
-
-# ifdef USE_AS_WMEMCMP
-/* for wmemcmp */
cmp %eax, %ecx
jne L(diffin4bytes)
xor %eax, %eax
ret
-# endif
-
L(diffin4bytes):
-# ifndef USE_AS_WMEMCMP
- cmp %cx, %ax
- jne L(diffin2bytes)
- shr $16, %ecx
- shr $16, %eax
-L(diffin2bytes):
- cmp %cl, %al
- jne L(end)
- and $0xffff, %eax
- and $0xffff, %ecx
- sub %ecx, %eax
- ret
- .p2align 4
-L(end):
- and $0xff, %eax
- and $0xff, %ecx
- sub %ecx, %eax
- ret
-# else
-
-/* for wmemcmp */
mov $1, %eax
jl L(nequal_bigger)
neg %eax