This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 1/2] Improve memset for older amd processors.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 30 Sep 2013 21:35:39 +0200
- Subject: [PATCH 1/2] Improve memset for older amd processors.
- Authentication-results: sourceware.org; auth=none
Hi, I looked at how to improve memset, and it does not fit well into ifuncs.
This handles the one situation I am certain about: on old Athlons
the following implementation is better.
I moved the current implementation to memset-ssse3.S, which has the side
benefit that I could use pshufb, which gives around a 0.5% improvement.
The problem is that a rep-based implementation is also better on this Xeon, by 12%,
and I do not know how to add a special case for it to the ifunc.
http://kam.mff.cuni.cz/~ondra/benchmark_string/xeon/memset_profile_loop/results_gcc/result.html
A second part will improve the current implementation for the rest of the
machines.
* sysdeps/x86_64/multiarch/memset.S: Add ifuncs.
* sysdeps/x86_64/multiarch/memset.S: Move to ...
* sysdeps/x86_64/multiarch/memset-ssse3.S: ... here.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Add memset-ssse3.
---
sysdeps/x86_64/memset.S | 118 +++++++++--------------------
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 5 ++
sysdeps/x86_64/multiarch/memset-ssse3.S | 100 ++++++++++++++++++++++++
sysdeps/x86_64/multiarch/memset.S | 71 +++++++++++++++++
5 files changed, 213 insertions(+), 83 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/memset.S
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 6c69f4b..e75d3f6 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -19,31 +19,15 @@
#include <sysdep.h>
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
.text
#if !defined NOT_IN_libc
ENTRY(__bzero)
- movq %rdi, %rax /* Set return value. */
movq %rsi, %rdx /* Set n. */
- pxor %xmm8, %xmm8
+ xor %rax, %rax
jmp L(entry_from_bzero)
END(__bzero)
weak_alias (__bzero, bzero)
-/* Like memset but takes additional parameter with return value. */
-ENTRY(__memset_tail)
- movq %rcx, %rax /* Set return value. */
-
- movd %esi, %xmm8
- punpcklbw %xmm8, %xmm8
- punpcklwd %xmm8, %xmm8
- pshufd $0, %xmm8, %xmm8
-
- jmp L(entry_from_bzero)
-END(__memset_tail)
#endif
#if defined PIC && !defined NOT_IN_libc
@@ -54,76 +38,46 @@ END_CHK (__memset_chk)
#endif
ENTRY (memset)
- movd %esi, %xmm8
- movq %rdi, %rax
- punpcklbw %xmm8, %xmm8
- punpcklwd %xmm8, %xmm8
- pshufd $0, %xmm8, %xmm8
+ movzbl %sil, %eax
+ lea (%rdi, %rdx), %rcx
+ movabsq $72340172838076673, %rsi
+ imulq %rsi, %rax
L(entry_from_bzero):
- cmpq $64, %rdx
- ja L(loop_start)
- cmpq $16, %rdx
- jbe L(less_16_bytes)
- cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu %xmm8, -16(%rdi,%rdx)
- ja L(between_32_64_bytes)
-L(return):
- rep
- ret
- ALIGN (4)
-L(between_32_64_bytes):
- movdqu %xmm8, 16(%rdi)
- movdqu %xmm8, -32(%rdi,%rdx)
+ cmp $7, %rdx
+ jbe L(less_8_bytes)
+ movq %rax, (%rdi)
+ movq %rdi, %rsi
+ leaq 8(%rdi), %rdi
+ movq %rax, -8(%rcx)
+ andq $-8, %rdi
+ subq %rdi, %rcx
+ shrq $3, %rcx
+ rep stosq
+ movq %rsi, %rax
ret
- ALIGN (4)
-L(loop_start):
- leaq 64(%rdi), %rcx
- movdqu %xmm8, (%rdi)
- andq $-64, %rcx
- movdqu %xmm8, -16(%rdi,%rdx)
- movdqu %xmm8, 16(%rdi)
- movdqu %xmm8, -32(%rdi,%rdx)
- movdqu %xmm8, 32(%rdi)
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu %xmm8, 48(%rdi)
- movdqu %xmm8, -64(%rdi,%rdx)
- addq %rdi, %rdx
- andq $-64, %rdx
- cmpq %rdx, %rcx
- je L(return)
- ALIGN (4)
-L(loop):
- movdqa %xmm8, (%rcx)
- movdqa %xmm8, 16(%rcx)
- movdqa %xmm8, 32(%rcx)
- movdqa %xmm8, 48(%rcx)
- addq $64, %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- rep
- ret
-L(less_16_bytes):
- movq %xmm8, %rcx
- testb $24, %dl
- jne L(between8_16bytes)
+
+.p2align 4
+L(less_8_bytes):
+ movq %rax, %rsi
+ movq %rdi, %rax
testb $4, %dl
- jne L(between4_7bytes)
- testb $1, %dl
- je L(odd_byte)
- movb %cl, (%rdi)
-L(odd_byte):
- testb $2, %dl
- je L(return)
- movw %cx, -2(%rax,%rdx)
+ jne L(between_4_7_bytes)
+ cmp $1, %dl
+ jbe L(between_0_1_bytes)
+ movw %si, -2(%rcx)
+ movb %sil, (%rdi)
ret
-L(between4_7bytes):
- movl %ecx, (%rdi)
- movl %ecx, -4(%rdi,%rdx)
+
+.p2align 3
+L(between_4_7_bytes):
+ movl %esi, (%rdi)
+ movl %esi, -4(%rcx)
ret
-L(between8_16bytes):
- movq %rcx, (%rdi)
- movq %rcx, -8(%rdi,%rdx)
+
+L(between_0_1_bytes):
+ jb L(zero_byte)
+ movb %sil, (%rdi)
+ L(zero_byte):
ret
END (memset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9fd0fd6..3fb3647 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf memcmp-ssse3 memset-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 71beab8..8bf1d53 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -37,6 +37,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
+ /* Support sysdeps/x86_64/multiarch/memset.S. */
+ IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, HAS_SSSE3, __memset_ssse3)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2))
+
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
diff --git a/sysdeps/x86_64/multiarch/memset-ssse3.S b/sysdeps/x86_64/multiarch/memset-ssse3.S
new file mode 100644
index 0000000..68c9490
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-ssse3.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+ Optimized version for x86-64.
+ Copyright (C) 2002-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+ .text
+ENTRY (__memset_ssse3)
+ movd %esi, %xmm8
+ movq %rdi, %rax
+ punpcklbw %xmm8, %xmm8
+ punpcklwd %xmm8, %xmm8
+ pshufd $0, %xmm8, %xmm8
+ cmpq $64, %rdx
+ ja L(loop_start)
+ cmpq $16, %rdx
+ jbe L(less_16_bytes)
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja L(between_32_64_bytes)
+L(return):
+ rep
+ ret
+ ALIGN (4)
+L(between_32_64_bytes):
+ movdqu %xmm8, 16(%rdi)
+ movdqu %xmm8, -32(%rdi,%rdx)
+ ret
+ ALIGN (4)
+L(loop_start):
+ leaq 64(%rdi), %rcx
+ movdqu %xmm8, (%rdi)
+ andq $-64, %rcx
+ movdqu %xmm8, -16(%rdi,%rdx)
+ movdqu %xmm8, 16(%rdi)
+ movdqu %xmm8, -32(%rdi,%rdx)
+ movdqu %xmm8, 32(%rdi)
+ movdqu %xmm8, -48(%rdi,%rdx)
+ movdqu %xmm8, 48(%rdi)
+ movdqu %xmm8, -64(%rdi,%rdx)
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ cmpq %rdx, %rcx
+ je L(return)
+ ALIGN (4)
+L(loop):
+ movdqa %xmm8, (%rcx)
+ movdqa %xmm8, 16(%rcx)
+ movdqa %xmm8, 32(%rcx)
+ movdqa %xmm8, 48(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ rep
+ ret
+L(less_16_bytes):
+ movq %xmm8, %rcx
+ testb $24, %dl
+ jne L(between8_16bytes)
+ testb $4, %dl
+ jne L(between4_7bytes)
+ testb $1, %dl
+ je L(odd_byte)
+ movb %cl, (%rdi)
+L(odd_byte):
+ testb $2, %dl
+ je L(return)
+ movw %cx, -2(%rax,%rdx)
+ ret
+L(between4_7bytes):
+ movl %ecx, (%rdi)
+ movl %ecx, -4(%rdi,%rdx)
+ ret
+L(between8_16bytes):
+ movq %rcx, (%rdi)
+ movq %rcx, -8(%rdi,%rdx)
+ ret
+
+END (__memset_ssse3)
+
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000..06dfe94
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,71 @@
+/* Multiple versions of memset
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib. */
+#ifndef NOT_IN_libc
+ENTRY(memset)
+ .type memset, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __memset_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq __memset_ssse3(%rip), %rax
+2: ret
+END(memset)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memset_sse2, @function; \
+ .globl __memset_sse2; \
+ .p2align 4; \
+ __memset_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memset_sse2, .-__memset_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memset_chk_sse2, @function; \
+ .globl __memset_chk_sse2; \
+ .p2align 4; \
+ __memset_chk_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memset_chk_sse2, .-__memset_chk_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+ The speedup we get from using GPR instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
--
1.8.4.rc3