Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: ling dot ma dot program at gmail dot com
- Cc: libc-alpha at sourceware dot org, liubov dot dmitrieva at gmail dot com, Ling Ma <ling dot ml at alibaba-inc dot com>
- Date: Fri, 11 Apr 2014 00:50:18 +0200
- Subject: Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction
- Authentication-results: sourceware.org; auth=none
- References: <1396595862-21707-1-git-send-email-ling dot ma dot program at gmail dot com>
On Fri, Apr 04, 2014 at 03:17:42AM -0400, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
>
> In this patch we manage to reduce branch mispredictions by
> avoiding branch instructions and forcing the destination to be aligned
> using AVX instructions.
>
> The CPU2006 403.gcc benchmark also indicates that this patch improves
> performance by 2% to 12% and by 2% to 21% compared with the original memcpy
> implementations based on SSE2 and SSSE3 respectively.
>
>                   memcpy-AVX  memcpy-SSE2 memcpy-SSSE3   AVX vs SSE2  AVX vs SSSE3
> gcc.166.i          302551459    332189574    345378682   1.097960575   1.141553517
> gcc.200.i          138036144    155904648    168229120   1.129448009   1.218732392
> gcc.cp-decl.i      283963419    296759183    312970805   1.045061311   1.102151841
> gcc.c-typeck.i     616484068    664855801    682119551   1.078463882   1.106467444
> gcc.expr2.i        781639964    858486085    893803320   1.098313961   1.143497468
> gcc.expr.i         580765337    593709446    596005444   1.022288019    1.02624142
> gcc.g23.i         1063726457   1162692750   1177232886   1.093037352   1.106706408
> gcc.s04.i          892109530    948328853    963836294   1.063018409   1.080401298
> gcc.scilab.i        62298843     66606465     72922104   1.069144494      1.170521
>
Similar comments as for the memset patch apply here.
> +/* memcpy with AVX
> + Copyright (C) 2014 Free Software Foundation, Inc.
> + Contributed by Alibaba Group.
We no longer add "Contributed by" lines.
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +#endif
Expand the ALIGN macro to .p2align directly.
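I.e. just write it out and drop the #ifndef ALIGN block, something like:

-	ALIGN(4)
+	.p2align 4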
> + lea (%rsi, %rdx), %r8
> + lea (%rdi, %rdx), %r9
Using %rcx instead of %r8 saves a byte; changing %r9 needs some work. Also,
could you save something by using ymm registers, or is that killed by latency?
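Something like this (untested, and it assumes the short-size paths below are
reworked so they no longer use %rcx as a scratch register):

	lea	(%rsi, %rdx), %rcx	/* end of source; %rcx as a base register
					   avoids the REX.B/VEX.B bit, so e.g. the
					   trailing vmovups -0x80(%rcx), %xmm8 loads
					   can use the shorter 2-byte VEX prefix.  */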
> + cmp $256, %rdx
> + ja L(256bytesormore)
> + cmp $128, %edx
> + jb L(less_128bytes)
> + vmovups (%rsi), %xmm0
> + vmovups 0x10(%rsi), %xmm1
> + vmovups 0x20(%rsi), %xmm2
snip
> + ALIGN(4)
> +L(less_16bytes):
> + cmp $8, %edx
> + jb L(less_8bytes)
> + movq (%rsi), %rcx
> + movq -0x08(%r8), %r10
> + movq %rcx, (%rdi)
> + movq %r10, -0x08(%r9)
Using %rdx instead of %r10 saves 2 bytes.
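I.e. (the length in %rdx is no longer needed on this path, so it can serve as
the scratch register):

	movq	(%rsi), %rcx
	movq	-0x08(%r8), %rdx	/* reuse %rdx once the size checks are done */
	movq	%rcx, (%rdi)
	movq	%rdx, -0x08(%r9)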
> +L(less_4bytes):
> + cmp $2, %edx
> + jb L(less_2bytes)
> + mov (%rsi), %cx
> + mov -0x02(%r8), %dx
> + mov %cx, (%rdi)
> + mov %dx, -0x02(%r9)
> + ret
> + ALIGN(4)
> +L(less_2bytes):
> + cmp $1, %rdx
> + jb L(less_0bytes)
> + mov (%rsi), %cl
> + mov %cl, (%rdi)
> +L(less_0bytes):
> + ret
> +
Again, you could save a comparison here.
> + ALIGN(4)
> +L(256bytesormore):
> +
> +#ifdef USE_AS_MEMMOVE
> + cmp %rsi, %rdi
> + jae L(copy_backward)
> +#endif
This could be an unpredictable branch; copying backward only when the regions
actually overlap is better.
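One way to shape that check (untested; it assumes a scratch register such as
%rcx is free at this point) is to take the backward path only when the
destination really falls inside the source:

#ifdef USE_AS_MEMMOVE
	mov	%rdi, %rcx
	sub	%rsi, %rcx		/* %rcx = dst - src, unsigned  */
	cmp	%rdx, %rcx
	jb	L(copy_backward)	/* dst in [src, src+len): a forward copy
					   would overwrite bytes not yet read  */
#endif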
> + mov %rdi, %r10
> + cmp $2048, %rdx
> + jae L(gobble_data_movsb)
> + vmovups -0x80(%r8), %xmm8
> + vmovups -0x70(%r8), %xmm9
> + and $-32, %rdi
> + add $32, %rdi
> + vmovups -0x60(%r8), %xmm10
> + vmovups -0x50(%r8), %xmm11
> + mov %rdi, %r11
> + sub %r10, %r11
> + vmovups -0x40(%r8), %xmm12
> + vmovups -0x30(%r8), %xmm13
> + sub %r11, %rdx
> + vmovups -0x20(%r8), %xmm14
> + vmovups -0x10(%r8), %xmm15
> + vmovups (%rsi), %ymm4
> + add %r11, %rsi
Does moving the vmovups %xmm8, -0x80(%r9)... stores up to here help?
Also check whether aligning the loop helps.
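For the loop alignment part, something as simple as this in front of the hot
loop is probably worth measuring:

	.p2align 4
L(goble_128_loop):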
> + sub $0x80, %rdx
> +L(goble_128_loop):
> + vmovups (%rsi), %ymm0
> + vmovups 0x20(%rsi), %ymm1
> + vmovups 0x40(%rsi), %ymm2
> + vmovups 0x60(%rsi), %ymm3
> + lea 0x80(%rsi), %rsi
> + vmovaps %ymm0, (%rdi)
> + vmovaps %ymm1, 0x20(%rdi)
> + vmovaps %ymm2, 0x40(%rdi)
> + vmovaps %ymm3, 0x60(%rdi)
> + lea 0x80(%rdi), %rdi
> + sub $0x80, %rdx
> + jae L(goble_128_loop)
> + vmovups %ymm4, (%r10)
> + vzeroupper
> + vmovups %xmm8, -0x80(%r9)
> + vmovups %xmm9, -0x70(%r9)
> + vmovups %xmm10, -0x60(%r9)
> + vmovups %xmm11, -0x50(%r9)
> + vmovups %xmm12, -0x40(%r9)
> + vmovups %xmm13, -0x30(%r9)
> + vmovups %xmm14, -0x20(%r9)
> + vmovups %xmm15, -0x10(%r9)
> + ret
> +
> +L(gobble_data_movsb):
> +
> +#ifdef SHARED_CACHE_SIZE_HALF
> + mov $SHARED_CACHE_SIZE_HALF, %rcx
> +#else
> + mov __x86_shared_cache_size_half(%rip), %rcx
Same typo as in the memset patch.