This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH RFC V2] Improve 64-bit memset for Core i7 with AVX2 instructions


From: Ma Ling <ling.ml@alibaba-inc.com>

In this patch we take an approach similar to memcpy: we avoid branch instructions
and force the destination to be aligned, using AVX instructions.
Profiling the gcc.403 benchmark shows that memset spends 5~10 times more time than memcpy.
The benchmark also indicates that this patch improves performance by 30% to 100%
compared with the original __memset_sse2.
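
For illustration only (not part of the patch), here is a rough C sketch of the
overlapping head/tail store idea the small-size paths below rely on; the function
name, the size range and the memcpy-based 8-byte stores are placeholders, the real
code uses unaligned vmovups stores instead:

#include <string.h>
#include <stdint.h>

/* Handle 16 <= n <= 32 without a per-byte tail loop or extra branches:
   one 16-byte block is written at the start and one ending exactly at
   dst + n, so the two stores simply overlap for the in-between sizes.  */
static void
memset_16_to_32 (unsigned char *dst, int c, size_t n)
{
  uint64_t v = 0x0101010101010101ULL * (unsigned char) c;
  memcpy (dst, &v, 8);
  memcpy (dst + 8, &v, 8);
  memcpy (dst + n - 16, &v, 8);
  memcpy (dst + n - 8, &v, 8);
}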

Ondra, I have also sent the gcc.403 test suite, the glibc patch, and readme.txt.

Thanks
Ling
---
In this version we add the vzeroupper instruction to avoid the AVX-to-SSE transition
(upper-state save & restore) penalty.
vpshufb needs only one cycle to fill the xmm0 register; thanks, Ondra.
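
For reference (again, not part of the patch), a rough intrinsics rendering of the
byte broadcast and the vzeroupper cleanup used below; with intrinsics the compiler
normally emits vzeroupper itself, so the explicit call only mirrors the assembly:

#include <immintrin.h>

/* Broadcast the fill byte into a 32-byte register the same way the
   assembly does: vmovd + vpshufb (an all-zero shuffle mask selects byte 0
   for every lane), then widen the xmm to a ymm.  Requires -mavx2.  */
static void
store32 (void *p, int c)
{
  __m128i x = _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c), _mm_setzero_si128 ());
  __m256i y = _mm256_inserti128_si256 (_mm256_castsi128_si256 (x), x, 1);
  _mm256_storeu_si256 ((__m256i *) p, y);
  _mm256_zeroupper ();  /* clear the upper ymm state before returning to SSE code */
}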

 sysdeps/x86_64/multiarch/Makefile          |   2 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   2 +
 sysdeps/x86_64/multiarch/memset-avx2.S     | 202 +++++++++++++++++++++++++++++
 3 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f92cf18..ae666bf 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3
+		   memcmp-ssse3 memset-avx2
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5639702..24d05d7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -67,12 +67,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, __memset_chk_avx2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_x86_64))
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64))
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..dc778c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,202 @@
+/* memset with AVX2
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+#ifndef MEMSET
+# define MEMSET	__memset_avx2
+# define MEMSET_CHK	__memset_chk_avx2
+#endif
+
+	.section .text.avx2,"ax",@progbits
+#if defined PIC
+ENTRY (MEMSET_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMSET_CHK)
+#endif
+
+ENTRY (MEMSET)
+	vpxor	%xmm0, %xmm0, %xmm0
+	vmovd %esi, %xmm1
+	lea	(%rdi, %rdx), %r8
+	vpshufb	%xmm0, %xmm1, %xmm0
+	mov	%rdi, %rax
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	xor	%ecx, %ecx
+	mov %sil, %cl
+	mov %cl, %ch
+	cmp	$128, %rdx
+	jb	L(less_128bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, 0x40(%rdi)
+	vmovups %xmm0, 0x50(%rdi)
+	vmovups %xmm0, 0x60(%rdi)
+	vmovups %xmm0, 0x70(%rdi)
+	vmovups %xmm0, -0x80(%r8)
+	vmovups %xmm0, -0x70(%r8)
+	vmovups %xmm0, -0x60(%r8)
+	vmovups %xmm0, -0x50(%r8)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	xor	%esi, %esi
+	mov	%ecx, %esi
+	shl	$16, %ecx
+	cmp	$64, %edx
+	jb	L(less_64bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	orl	%esi, %ecx
+	mov	%ecx, %esi
+	cmp	$32, %edx
+	jb	L(less_32bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_32bytes):
+	shl	$32, %rcx
+	cmp	$16, %edx
+	jb	L(less_16bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(less_16bytes):
+	or	%rsi, %rcx
+	cmp	$8, %edx
+	jb	L(less_8bytes)
+	mov %rcx, (%rdi)
+	mov %rcx, -0x08(%r8)
+	ret
+	ALIGN(4)
+L(less_8bytes):
+	cmp	$4, %edx
+	jb	L(less_4bytes)
+	mov %ecx, (%rdi)
+	mov %ecx, -0x04(%r8)
+	ALIGN(4)
+L(less_4bytes):
+	cmp	$2, %edx
+	jb	L(less_2bytes)
+	mov	%cx, (%rdi)
+	mov	%cx, -0x02(%r8)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp	$1, %edx
+	jb	L(less_1bytes)
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
+	vinserti128 $1, %xmm0, %ymm0, %ymm0
+	vmovups	%ymm0, (%rdi)
+	mov	%rdi, %r9
+	and	$-0x20, %rdi
+	add	$32, %rdi
+	sub	%rdi, %r9
+	add	%r9, %rdx
+	cmp	$4096, %rdx
+	ja	L(gobble_data)
+
+	sub	$0x80, %rdx
+L(gobble_128_loop):
+	prefetcht0	0x1c0(%rdi)
+	vmovaps	%ymm0, (%rdi)
+	prefetcht0	0x280(%rdi)
+	vmovaps	%ymm0, 0x20(%rdi)
+	vmovaps	%ymm0, 0x40(%rdi)
+	vmovaps	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_data):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r9
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %r9
+#endif
+	shl	$4, %r9
+	cmp	%r9, %rdx
+	ja	L(gobble_big_data)
+	mov	%rax, %r9
+	mov	%esi, %eax
+	mov	%rdx, %rcx
+	rep	stosb
+	mov	%r9, %rax
+	vzeroupper
+	ret
+
+	ALIGN(4)
+L(gobble_big_data):
+	sub	$0x80, %rdx
+L(gobble_big_data_loop):
+	vmovntdq	%ymm0, (%rdi)
+	vmovntdq	%ymm0, 0x20(%rdi)
+	vmovntdq	%ymm0, 0x40(%rdi)
+	vmovntdq	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_big_data_loop)
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	vzeroupper
+	sfence
+	ret
+
+END (MEMSET)
+#endif
-- 
1.8.1.4

