This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH 1/2] Improve memset for older AMD processors.


Hi, I looked into how to improve memset, and it does not map well onto
ifuncs.

This patch handles the one case I am certain about: on old Athlons the
implementation below is better.
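
In C, the sketch below is roughly what the new code does (my
paraphrase, not part of the patch; the memcpy calls stand in for the
unaligned qword stores, the loop for rep stosq, and the movabsq
constant 72340172838076673 is 0x0101010101010101):

#include <stdint.h>
#include <string.h>

/* Sketch of the rewritten sysdeps/x86_64/memset.S: broadcast the byte
   with one multiply, cover both ends with two unaligned 8-byte stores,
   then fill the aligned middle.  */
static void *
memset_sketch (void *s, int c, size_t n)
{
  uint64_t pattern = (uint8_t) c * 0x0101010101010101ULL;
  unsigned char *p = s;

  if (n >= 8)
    {
      memcpy (p, &pattern, 8);		/* unaligned head  */
      memcpy (p + n - 8, &pattern, 8);	/* unaligned tail  */
      /* Round the start up to an 8-byte boundary; the tail store above
	 covers whatever the qword loop below leaves out.  */
      uint64_t *q = (uint64_t *) (((uintptr_t) p + 8) & ~(uintptr_t) 7);
      uint64_t *end = (uint64_t *) ((uintptr_t) (p + n) & ~(uintptr_t) 7);
      while (q < end)			/* rep stosq in the assembly  */
	*q++ = pattern;
    }
  else
    /* The assembly branches on the size bits here; a byte loop is
       enough for a sketch.  */
    while (n--)
      *p++ = (unsigned char) c;
  return s;
}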

I moved the current implementation to memset-ssse3.S, which has the
side benefit that I could use pshufb there, gaining around 0.5%.
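
For reference, pshufb against an all-zero mask does the whole byte
broadcast in one instruction; in intrinsics (illustrative only; the
memset-ssse3.S in this patch still carries the old
punpcklbw/punpcklwd/pshufd splat):

#include <tmmintrin.h>	/* SSSE3; build with -mssse3 */

static inline __m128i
broadcast_byte (int c)
{
  __m128i v = _mm_cvtsi32_si128 (c);	/* movd  */
  /* A zero shuffle mask selects byte 0 for all 16 lanes.  */
  return _mm_shuffle_epi8 (v, _mm_setzero_si128 ());	/* pshufb  */
}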

The problem is that a rep-based implementation is also better on this
Xeon, by 12%, and I do not know how to add such a special case to an
ifunc:
http://kam.mff.cuni.cz/~ondra/benchmark_string/xeon/memset_profile_loop/results_gcc/result.html
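
To show what I mean, a hypothetical C resolver could look like the
sketch below; every name in it is made up (the patch keeps the dispatch
in assembly in multiarch/memset.S, and a real check would have to test
CPUID family/model for that Xeon):

#include <stddef.h>

/* Placeholder bodies standing in for the real rep-based and generic
   variants.  */
static void *
my_memset_rep (void *s, int c, size_t n)
{
  unsigned char *p = s;
  while (n--)
    *p++ = (unsigned char) c;
  return s;
}

static void *
my_memset_generic (void *s, int c, size_t n)
{
  unsigned char *p = s;
  while (n--)
    *p++ = (unsigned char) c;
  return s;
}

/* Hypothetical model check; a real one would read CPUID leaf 1.  */
static int
cpu_is_measured_xeon (void)
{
  return 0;
}

/* The loader calls the resolver once and binds calls through
   my_memset to whichever implementation it returns.  */
static void *(*resolve_memset (void)) (void *, int, size_t)
{
  return cpu_is_measured_xeon () ? my_memset_rep : my_memset_generic;
}

void *my_memset (void *, int, size_t)
  __attribute__ ((ifunc ("resolve_memset")));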

A second part will improve the current implementation for the rest of
the machines.


	* sysdeps/x86_64/memset.S: Move implementation to ...
	* sysdeps/x86_64/multiarch/memset-ssse3.S: ... here.  New file.
	* sysdeps/x86_64/multiarch/memset.S: New file.  Add memset ifunc.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
	Add __memset_sse2 and __memset_ssse3.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
	Add memset-ssse3.

---
 sysdeps/x86_64/memset.S                    | 119 +++++++++--------------------
 sysdeps/x86_64/multiarch/Makefile          |   2 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   5 ++
 sysdeps/x86_64/multiarch/memset-ssse3.S    | 100 ++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memset.S          |  71 +++++++++++++++++
 5 files changed, 214 insertions(+), 83 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/memset.S

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 6c69f4b..e75d3f6 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -19,31 +19,16 @@
 
 #include <sysdep.h>
 
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
 	.text
 #if !defined NOT_IN_libc
 ENTRY(__bzero)
-	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	xor	%rax, %rax
+	lea	(%rdi, %rdx), %rcx /* L(entry_from_bzero) expects the end pointer here.  */
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
 
-/* Like memset but takes additional parameter with return value.  */
-ENTRY(__memset_tail)
-	movq	%rcx, %rax /* Set return value.  */
-
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
-
-	jmp	L(entry_from_bzero)
-END(__memset_tail)
 #endif
 
 #if defined PIC && !defined NOT_IN_libc
@@ -54,76 +39,46 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
-	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movzbl	%sil, %eax
+	lea	(%rdi, %rdx), %rcx
+	movabsq	$72340172838076673, %rsi /* 0x0101010101010101  */
+	imulq	%rsi, %rax
 L(entry_from_bzero):
-	cmpq	$64, %rdx
-	ja	L(loop_start)
-	cmpq	$16, %rdx
-	jbe	L(less_16_bytes)
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	L(between_32_64_bytes)
-L(return):
-	rep
-	ret
-	ALIGN (4)
-L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	cmp	$7, %rdx
+	jbe	L(less_8_bytes)
+	movq	%rax, (%rdi)
+	movq	%rdi, %rsi
+	leaq	8(%rdi), %rdi
+	movq	%rax, -8(%rcx)
+	andq	$-8, %rdi
+	subq	%rdi, %rcx
+	shrq	$3, %rcx
+	rep stosq
+	movq	%rsi, %rax
 	ret
-	ALIGN (4)
-L(loop_start):
-	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
-	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-	ALIGN (4)
-L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	rep
-	ret
-L(less_16_bytes):
-	movq %xmm8, %rcx
-	testb	$24, %dl
-	jne	L(between8_16bytes)
+
+	.p2align 4
+L(less_8_bytes):
+	movq	%rax, %rsi
+	movq	%rdi, %rax
 	testb	$4, %dl
-	jne	L(between4_7bytes)
-	testb	$1, %dl
-	je	L(odd_byte)
-	movb	%cl, (%rdi)
-L(odd_byte):
-	testb	$2, %dl
-	je	L(return)
-	movw	%cx, -2(%rax,%rdx)
+	jne	L(between_4_7_bytes)
+	cmp	$1, %dl
+	jbe	L(between_0_1_bytes)
+	movw	%si, -2(%rcx)
+	movb	%sil, (%rdi)
 	ret
-L(between4_7bytes):
-	movl	%ecx, (%rdi)
-	movl	%ecx, -4(%rdi,%rdx)
+
+	.p2align 3
+L(between_4_7_bytes):
+	movl	%esi, (%rdi)
+	movl	%esi, -4(%rcx)
 	ret
-L(between8_16bytes):
-	movq	%rcx, (%rdi)
-	movq	%rcx, -8(%rdi,%rdx)
+
+L(between_0_1_bytes):
+	jb	L(zero_byte)
+	movb	%sil, (%rdi)
+L(zero_byte):
 	ret
 
 END (memset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9fd0fd6..3fb3647 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3
+		   strchr-sse2-no-bsf memcmp-ssse3 memset-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 71beab8..8bf1d53 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -37,6 +37,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   size_t i = 0;
 
+  /* Support sysdeps/x86_64/multiarch/memset.S.  */
+  IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_SSSE3, __memset_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2))
+
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
diff --git a/sysdeps/x86_64/multiarch/memset-ssse3.S b/sysdeps/x86_64/multiarch/memset-ssse3.S
new file mode 100644
index 0000000..68c9490
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-ssse3.S
@@ -0,0 +1,100 @@
+/* memset/bzero -- set memory area to CH/0
+   Optimized version for x86-64.
+   Copyright (C) 2002-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+	.text
+ENTRY (__memset_ssse3)
+	movd	%esi, %xmm8
+	movq	%rdi, %rax
+	punpcklbw	%xmm8, %xmm8
+	punpcklwd	%xmm8, %xmm8
+	pshufd	$0, %xmm8, %xmm8
+	cmpq	$64, %rdx
+	ja	L(loop_start)
+	cmpq	$16, %rdx
+	jbe	L(less_16_bytes)
+	cmpq	$32, %rdx
+	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	ja	L(between_32_64_bytes)
+L(return):
+	rep
+	ret
+	ALIGN (4)
+L(between_32_64_bytes):
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	ret
+	ALIGN (4)
+L(loop_start):
+	leaq	64(%rdi), %rcx
+	movdqu	%xmm8, (%rdi)
+	andq	$-64, %rcx
+	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm8, 16(%rdi)
+	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm8, 32(%rdi)
+	movdqu	%xmm8, -48(%rdi,%rdx)
+	movdqu	%xmm8, 48(%rdi)
+	movdqu	%xmm8, -64(%rdi,%rdx)
+	addq	%rdi, %rdx
+	andq	$-64, %rdx
+	cmpq	%rdx, %rcx
+	je	L(return)
+	ALIGN (4)
+L(loop):
+	movdqa	%xmm8, (%rcx)
+	movdqa	%xmm8, 16(%rcx)
+	movdqa	%xmm8, 32(%rcx)
+	movdqa	%xmm8, 48(%rcx)
+	addq	$64, %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	rep
+	ret
+L(less_16_bytes):
+	movq	%xmm8, %rcx
+	testb	$24, %dl
+	jne	L(between8_16bytes)
+	testb	$4, %dl
+	jne	L(between4_7bytes)
+	testb	$1, %dl
+	je	L(odd_byte)
+	movb	%cl, (%rdi)
+L(odd_byte):
+	testb	$2, %dl
+	je	L(return)
+	movw	%cx, -2(%rax,%rdx)
+	ret
+L(between4_7bytes):
+	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rdi,%rdx)
+	ret
+L(between8_16bytes):
+	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rdi,%rdx)
+	ret
+
+END (__memset_ssse3)
+
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
new file mode 100644
index 0000000..06dfe94
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -0,0 +1,71 @@
+/* Multiple versions of memset
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+ENTRY(memset)
+	.type	memset, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memset_sse2(%rip), %rax
+	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	__memset_ssse3(%rip), %rax
+2:	ret
+END(memset)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memset_sse2, @function; \
+	.globl __memset_sse2; \
+	.p2align 4; \
+	__memset_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memset_sse2, .-__memset_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memset_chk_sse2, @function; \
+	.globl __memset_chk_sse2; \
+	.p2align 4; \
+	__memset_chk_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memset_chk_sse2, .-__memset_chk_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memset calls through a PLT.
+   The speedup we get from using GPR instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(original, alias)
+#endif
+
+#include "../memset.S"
-- 
1.8.4.rc3

