This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64-bit memcpy performance for Haswell CPUs with AVX instructions


On Fri, Apr 04, 2014 at 03:17:42AM -0400, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
> 
> In this patch we reduce branch mispredictions by avoiding branch
> instructions and by forcing the destination to be aligned for the AVX
> stores.
> 
> The CPU2006 403.gcc benchmark also indicates that this patch improves
> performance by 2% to 12% over the SSE2 memcpy and by 2% to 21% over the
> SSSE3 memcpy.
> 
>                   memcpy-AVX  memcpy-SSE2  memcpy-SSSE3  AVX vs SSE2  AVX vs SSSE3
> gcc.166.i          302551459    332189574     345378682  1.097960575   1.141553517
> gcc.200.i          138036144    155904648     168229120  1.129448009   1.218732392
> gcc.cp-decl.i      283963419    296759183     312970805  1.045061311   1.102151841
> gcc.c-typeck.i     616484068    664855801     682119551  1.078463882   1.106467444
> gcc.expr2.i        781639964    858486085     893803320  1.098313961   1.143497468
> gcc.expr.i         580765337    593709446     596005444  1.022288019    1.02624142
> gcc.g23.i         1063726457   1162692750    1177232886  1.093037352   1.106706408
> gcc.s04.i          892109530    948328853     963836294  1.063018409   1.080401298
> gcc.scilab.i        62298843     66606465      72922104  1.069144494      1.170521
> 

Similar comments as for the memset patch apply here.

> +/* memcpy with AVX
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.

We do not add "Contributed by" lines anymore; please drop this one.

> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n)	.p2align n
> +#endif

Please expand this to .p2align directly and drop the ALIGN macro.
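
That is, for example:

	.p2align 4
L(less_16bytes):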

> +	lea	(%rsi, %rdx), %r8
> +	lea	(%rdi, %rdx), %r9

Using %rcx instead of %r8 saves a byte; changing %r9 needs some more work.
Also, could you save something by using ymm registers, or is that killed by
latency?
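
For example, the 128-to-256-byte path could do four 32-byte loads per half
instead of eight 16-byte ones, roughly like this untested sketch (it then
needs a vzeroupper before the ret):

	vmovups	(%rsi), %ymm0
	vmovups	0x20(%rsi), %ymm1
	vmovups	0x40(%rsi), %ymm2
	vmovups	0x60(%rsi), %ymm3
	vmovups	-0x80(%r8), %ymm4
	vmovups	-0x60(%r8), %ymm5
	vmovups	-0x40(%r8), %ymm6
	vmovups	-0x20(%r8), %ymm7
	vmovups	%ymm0, (%rdi)
	vmovups	%ymm1, 0x20(%rdi)
	vmovups	%ymm2, 0x40(%rdi)
	vmovups	%ymm3, 0x60(%rdi)
	vmovups	%ymm4, -0x80(%r9)
	vmovups	%ymm5, -0x60(%r9)
	vmovups	%ymm6, -0x40(%r9)
	vmovups	%ymm7, -0x20(%r9)
	vzeroupper
	ret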

> +	cmp	$256, %rdx
> +	ja	L(256bytesormore)
> +	cmp	$128, %edx
> +	jb	L(less_128bytes)
> +	vmovups (%rsi), %xmm0
> +	vmovups 0x10(%rsi), %xmm1
> +	vmovups 0x20(%rsi), %xmm2

snip

> +	ALIGN(4)
> +L(less_16bytes):
> +	cmp	$8, %edx
> +	jb	L(less_8bytes)
> +	movq (%rsi),	%rcx
> +	movq -0x08(%r8),	%r10
> +	movq %rcx, (%rdi)
> +	movq %r10, -0x08(%r9)

Using %rdx instead of %r10 saves 2 bytes.
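
I.e. something like this (untested sketch; %rdx is dead here since the end
pointers in %r8/%r9 already encode the length):

	movq	(%rsi), %rcx
	movq	-0x08(%r8), %rdx
	movq	%rcx, (%rdi)
	movq	%rdx, -0x08(%r9)
	ret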


> +L(less_4bytes):
> +	cmp	$2, %edx
> +	jb	L(less_2bytes)
> +	mov (%rsi),	%cx
> +	mov -0x02(%r8),	%dx
> +	mov %cx, (%rdi)
> +	mov %dx, -0x02(%r9)
> +	ret
> +	ALIGN(4)
> +L(less_2bytes):
> +	cmp	$1, %rdx
> +	jb	L(less_0bytes)
> +	mov	(%rsi), %cl
> +	mov	%cl,	(%rdi)
> +L(less_0bytes):
> +	ret
> +

Again, you could save a comparison here.

> +	ALIGN(4)
> +L(256bytesormore):
> +
> +#ifdef USE_AS_MEMMOVE
> +	cmp	%rsi, %rdi
> +	jae	L(copy_backward)
> +#endif

This could be an unpredictable branch; doing the backward copy only when the
regions actually overlap would be better.
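
Something like the following should do for the overlap check (untested;
%rcx looks unused at this point):

	mov	%rdi, %rcx
	sub	%rsi, %rcx		/* dst - src as unsigned */
	cmp	%rdx, %rcx
	jb	L(copy_backward)	/* only when dst is inside [src, src+len) */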

> +	mov	%rdi, %r10
> +	cmp	$2048, %rdx
> +	jae	L(gobble_data_movsb)
> +	vmovups -0x80(%r8), %xmm8
> +	vmovups -0x70(%r8), %xmm9
> +	and	$-32, %rdi
> +	add	$32, %rdi
> +	vmovups -0x60(%r8), %xmm10
> +	vmovups -0x50(%r8), %xmm11
> +	mov	%rdi, %r11
> +	sub	%r10, %r11
> +	vmovups -0x40(%r8), %xmm12
> +	vmovups -0x30(%r8), %xmm13
> +	sub	%r11, %rdx
> +	vmovups -0x20(%r8), %xmm14
> +	vmovups -0x10(%r8), %xmm15
> +	vmovups	(%rsi), %ymm4
> +	add	%r11, %rsi

Does moving the vmovups %xmm8, -0x80(%r9)... stores up here help?

Also check whether aligning the loop helps.
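
For the latter, simply trying a .p2align just before the loop label should be
enough to check:

	.p2align 4
L(goble_128_loop):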

> +	sub	$0x80, %rdx
> +L(goble_128_loop):
> +	vmovups (%rsi), %ymm0
> +	vmovups 0x20(%rsi), %ymm1
> +	vmovups 0x40(%rsi), %ymm2
> +	vmovups 0x60(%rsi), %ymm3
> +	lea	0x80(%rsi), %rsi
> +	vmovaps %ymm0, (%rdi)
> +	vmovaps %ymm1, 0x20(%rdi)
> +	vmovaps %ymm2, 0x40(%rdi)
> +	vmovaps %ymm3, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(goble_128_loop)
> +	vmovups	%ymm4, (%r10)
> +	vzeroupper
> +	vmovups %xmm8, -0x80(%r9)
> +	vmovups %xmm9, -0x70(%r9)
> +	vmovups %xmm10, -0x60(%r9)
> +	vmovups %xmm11, -0x50(%r9)
> +	vmovups %xmm12, -0x40(%r9)
> +	vmovups %xmm13, -0x30(%r9)
> +	vmovups %xmm14, -0x20(%r9)
> +	vmovups %xmm15, -0x10(%r9)
> +	ret
> +
> +L(gobble_data_movsb):
> +
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %rcx

Same typo as in the memset patch.

