Re: [PATCH v2] Improve strcpy: Faster unaligned loads.
- From: "Carlos O'Donell" <carlos at redhat dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: Andreas Schwab <schwab at linux-m68k dot org>, libc-alpha at sourceware dot org
- Date: Tue, 10 Sep 2013 10:46:51 -0400
- Subject: Re: [PATCH v2] Improve strcpy: Faster unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20130909153051 dot GA23047 at domone dot kolej dot mff dot cuni dot cz> <20130909161112 dot GB23047 at domone dot kolej dot mff dot cuni dot cz> <mvmbo42dkiq dot fsf at hawking dot suse dot de> <20130909171703 dot GA32141 at domone dot kolej dot mff dot cuni dot cz> <87ob81c1yk dot fsf at igel dot home> <20130909191829 dot GA997 at domone dot kolej dot mff dot cuni dot cz> <522E28E9 dot 5000709 at redhat dot com> <20130910142117 dot GB6536 at domone dot kolej dot mff dot cuni dot cz>
On 09/10/2013 10:21 AM, Ondřej Bílka wrote:
> Here is v2.
>
> * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S: New implementation.
> * sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: Use new
> implementation.
> * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Do tailcall
> of __strcpy_sse2_unaligned_tail.
The ChangeLog doesn't match the patch: it names strcpy-sse2-unaligned-new.S,
but the patch adds strcpy-sse2-unaligned-v2.S.
This still needs more comments, please; see my comments below.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +-
> sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 2 +-
> sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 6 +-
> .../x86_64/multiarch/strcpy-sse2-unaligned-v2.S | 280 +++++++++++++++++++++
> 4 files changed, 287 insertions(+), 3 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..88eaf9d 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -14,7 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> - strcpy-sse2-unaligned strncpy-sse2-unaligned \
> + strcpy-sse2-unaligned-v2 strncpy-sse2-unaligned \
OK.
> stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> strcat-sse2-unaligned strncat-sse2-unaligned \
> strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> index 34231f8..363b692 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> @@ -1,3 +1,3 @@
> #define USE_AS_STPCPY
> #define STRCPY __stpcpy_sse2_unaligned
> -#include "strcpy-sse2-unaligned.S"
> +#include "strcpy-sse2-unaligned-v2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..d26db8a 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> @@ -273,7 +273,11 @@ L(StartStrcpyPart):
> test %r8, %r8
> jz L(ExitZero)
> # define USE_AS_STRNCPY
> +# include "strcpy-sse2-unaligned.S"
> +
> +# else
> + jmp __strcpy_sse2_unaligned_tail
> + END (STRCAT)
> # endif
>
> -# include "strcpy-sse2-unaligned.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> new file mode 100644
> index 0000000..ac9ac55
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> @@ -0,0 +1,280 @@
> +/* strcpy with SSE2 and unaligned load
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCPY
> +# define STRCPY __strcpy_sse2_unaligned
> +# endif
> +
> +# define ALIGN(x) .p2align x
> +
> +ENTRY (STRCPY)
> + /* We use basic loop as described in
> + http://sourceware.org/glibc/wiki/Optimizations/string_functions */
This is an excellent writeup. I like that you created a distinct
Optimizations section in the wiki for this and I'm excited to see
more writeups there.
This looks almost done.
I would still like to see comments in the assembly that at least
outline the layout of the code, so that someone reading the writeup
can easily follow the code without the mental effort of mapping
the code back to the templates.
Please post a v3 with inline comments and I'll review it quickly.
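Something along these lines is what I have in mind.  To be clear, this
is only a rough scalar sketch of the control flow: the function name
and the strnlen/memcpy calls are mine and purely illustrative, and the
real code does each step with 16-byte SSE2 loads, pcmpeqb/pmovmskb
masks and overlapping unaligned stores.  But short comments with this
structure next to the labels would let a reader map the assembly back
to the writeup at a glance.

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>

/* Rough scalar outline of the control flow; illustrative only.  */
char *
strcpy_sketch (char *dst, const char *src)
{
  if (((uintptr_t) src & 4095) <= 4032)
    {
      /* Fast path: a 64-byte unaligned read from SRC cannot cross a
         page boundary, so probe the first 64 bytes directly (the four
         movdqu loads at the top of the function).  */
      size_t len = strnlen (src, 64);
      if (len < 64)     /* L(less_32_bytes), less_64_bytes */
        {
          memcpy (dst, src, len + 1);
          return dst;
        }
      memcpy (dst, src, 64);
    }
  else
    {
      /* L(cross_page): the unaligned read might fault, so only scan up
         to the next 64-byte boundary of SRC before entering the loop.  */
      size_t head = 64 - ((uintptr_t) src & 63);
      size_t len = strnlen (src, head);
      memcpy (dst, src, len);
      if (len < head)
        {
          dst[len] = '\0';
          return dst;
        }
    }

  /* L(prepare_loop)/L(loop): advance SRC to the next 64-byte boundary
     (and DST by the same amount, so a few already-copied bytes may be
     stored again), then copy 64 bytes per iteration, testing each block
     for a zero byte (pminub + pcmpeqb + bsfq in the real code) and
     finishing with the short tail cases.  */
  size_t adv = 64 - ((uintptr_t) src & 63);
  src += adv;
  dst += adv;
  for (;;)
    {
      size_t len = strnlen (src, 64);
      memcpy (dst, src, len);
      if (len < 64)
        {
          dst[len] = '\0';
          return dst;
        }
      src += 64;
      dst += 64;
    }
}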
> + movq %rsi, %rdx
> + pxor %xmm4, %xmm4
> + movq %rdi, %rax
> + pxor %xmm5, %xmm5
> + andl $4095, %edx
> + pxor %xmm6, %xmm6
> + cmpq $4032, %rdx
> + ja L(cross_page)
> + movdqu (%rsi), %xmm1
> + pxor %xmm7, %xmm7
> + movdqu 16(%rsi), %xmm2
> + pcmpeqb %xmm1, %xmm4
> + pmovmskb %xmm4, %ecx
> + pcmpeqb %xmm2, %xmm5
> + pmovmskb %xmm5, %edx
> + salq $16, %rdx
> + orq %rcx, %rdx
> + jne L(less_32_bytes)
> + movdqu 32(%rsi), %xmm3
> + movdqu 48(%rsi), %xmm4
> + pcmpeqb %xmm3, %xmm6
> + pcmpeqb %xmm4, %xmm7
> + pmovmskb %xmm6, %edx
> + pmovmskb %xmm7, %ecx
> + salq $32, %rdx
> + salq $48, %rcx
> + orq %rcx, %rdx
> + jne less_64_bytes
> + movdqu %xmm1, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> + movdqu %xmm3, 32(%rdi)
> + movdqu %xmm4, 48(%rdi)
> +L(prepare_loop):
> + leaq 64(%rsi), %rdx
> + andq $-64, %rdx
> + addq %rdx, %rdi
> + pxor %xmm5, %xmm5
> + subq %rsi, %rdi
> + movq %rdx, %rsi
> + jmp L(loop_entry)
> +
> + ALIGN (4)
> +L(loop):
> + movdqu %xmm1, (%rdi)
> + addq $64, %rsi
> + movdqu %xmm2, 16(%rdi)
> + movdqu %xmm3, 32(%rdi)
> + movdqu %xmm4, 48(%rdi)
> + addq $64, %rdi
> +L(loop_entry):
> + movdqa 32(%rsi), %xmm3
> + movdqa 48(%rsi), %xmm4
> + movdqa %xmm3, %xmm0
> + movdqa 16(%rsi), %xmm2
> + pminub %xmm4, %xmm0
> + movdqa (%rsi), %xmm1
> + pminub %xmm2, %xmm0
> + pminub %xmm1, %xmm0
> + pcmpeqb %xmm5, %xmm0
> + pmovmskb %xmm0, %edx
> + testq %rdx, %rdx
> + je L(loop)
> +
> + salq $48, %rdx
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm2, %xmm6
> + pmovmskb %xmm5, %ecx
> + pmovmskb %xmm6, %r8d
> + pcmpeqb %xmm3, %xmm7
> + orq %rcx, %rdx
> + pmovmskb %xmm7, %r9d
> + salq $16, %r8
> + orq %r8, %rdx
> + salq $32, %r9
> + orq %r9, %rdx
> + bsfq %rdx, %rcx
> +#ifdef USE_AS_STPCPY
> + lea (%rdi, %rcx), %rax
> +#endif
> + cmpq $32, %rcx
> + jb L(less_32_bytes)
> + movdqu -31(%rsi,%rcx), %xmm3
> + movdqu -15(%rsi,%rcx), %xmm4
> + movdqu %xmm1, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> + movdqu %xmm3, -31(%rdi,%rcx)
> + movdqu %xmm4, -15(%rdi,%rcx)
> + ret
> +
> +
> + ALIGN (3)
> +L(between_16_31_bytes):
> + movdqu -15(%rsi,%rdx), %xmm2
> + movdqu %xmm1, (%rdi)
> + movdqu %xmm2, -15(%rdi,%rdx)
> + ret
> +
> + ALIGN (3)
> +L(less_32_bytes):
> + bsfq %rdx, %rdx
> +#ifdef USE_AS_STPCPY
> + lea (%rdi, %rdx), %rax
> +#endif
> + cmpq $15, %rdx
> + jae L(between_16_31_bytes)
> + cmpq $7, %rdx
> + jae L(between_8_15_bytes)
> + cmpq $3, %rdx
> + jae L(between_4_7_bytes)
> + cmpq $1, %rdx
> + jb L(between_1_1_bytes) /* We need to write terminating zero. */
> + movzwl -1(%rsi,%rdx), %ecx
> + movzwl (%rsi), %esi
> + movw %si, (%rdi)
> + movw %cx, -1(%rdi,%rdx)
> + ret
> +
> + ALIGN (3)
> +less_64_bytes:
> + bsfq %rdx, %rdx
> +#ifdef USE_AS_STPCPY
> + lea (%rdi, %rdx), %rax
> +#endif
> + movdqu -31(%rsi,%rdx), %xmm3
> + movdqu -15(%rsi,%rdx), %xmm0
> + movdqu %xmm1, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> + movdqu %xmm3, -31(%rdi,%rdx)
> + movdqu %xmm0, -15(%rdi,%rdx)
> + ret
> +
> + ALIGN (3)
> +L(between_8_15_bytes):
> + movq -7(%rsi,%rdx), %rcx
> + movq (%rsi), %rsi
> + movq %rsi, (%rdi)
> + movq %rcx, -7(%rdi,%rdx)
> + ret
> +
> + ALIGN (3)
> +L(between_4_7_bytes):
> + movl -3(%rsi,%rdx), %ecx
> + movl (%rsi), %esi
> + movl %esi, (%rdi)
> + movl %ecx, -3(%rdi,%rdx)
> + ret
> +
> +L(between_1_1_bytes):
> + movzbl (%rsi), %edx
> + movb %dl, (%rdi)
> + ret
> +
> + ALIGN(4)
> +L(cross_page):
> + movq %rsi, %rcx
> + pxor %xmm0, %xmm0
> + andq $-64, %rcx
> + movabsq $-9223372036854775808, %r10
> + movdqa (%rcx), %xmm4
> + movdqa 16(%rcx), %xmm3
> + pcmpeqb %xmm0, %xmm4
> + movdqa 32(%rcx), %xmm2
> + pcmpeqb %xmm0, %xmm3
> + pmovmskb %xmm4, %edx
> + movdqa 48(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm2
> + pcmpeqb %xmm0, %xmm1
> + orq %r10, %rdx
> + pmovmskb %xmm3, %r10d
> + pmovmskb %xmm2, %r9d
> + salq $16, %r10
> + orq %r10, %rdx
> + pmovmskb %xmm1, %r8d
> + salq $32, %r9
> + orq %r9, %rdx
> + salq $48, %r8
> + orq %r8, %rdx
> + movq %rsi, %r10
> + subq %rcx, %r10
> + movq %r10, %rcx
> + shrq %cl, %rdx
> + bsfq %rdx, %rdx
> +#ifdef USE_AS_STPCPY
> + lea (%rdi, %rdx), %rax
> +#endif
> + cmpq $15, %rdx
> + jbe .L27
> + cmpq $31, %rdx
> + jbe .L28
> + movdqu (%rsi), %xmm3
> + movdqu 16(%rsi), %xmm2
> + movdqu -31(%rsi,%rdx), %xmm1
> + movdqu -15(%rsi,%rdx), %xmm0
> + movdqu %xmm3, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> + movdqu %xmm1, -31(%rdi,%rdx)
> + movdqu %xmm0, -15(%rdi,%rdx)
> +L(copied_cross_page):
> + pxor %xmm4, %xmm4
> + pxor %xmm5, %xmm5
> + pxor %xmm6, %xmm6
> + pxor %xmm7, %xmm7
> + cmpb $0, (%rsi,%rdx)
> + jne L(prepare_loop)
> + ret
> +
> + ALIGN (3)
> +.L27:
> + cmpq $7, %rdx
> + jae .L31
> + cmpq $3, %rdx
> + jae .L32
> + cmpq $1, %rdx
> + jb .L15
> + movzwl (%rsi), %ecx
> + movw %cx, (%rdi)
> + movzwl -1(%rsi,%rdx), %ecx
> + movw %cx, -1(%rdi,%rdx)
> + jmp L(copied_cross_page)
> +
> + ALIGN (3)
> +.L28:
> + movdqu (%rsi), %xmm1
> + movdqu -15(%rsi,%rdx), %xmm0
> + movdqu %xmm1, (%rdi)
> + movdqu %xmm0, -15(%rdi,%rdx)
> + jmp L(copied_cross_page)
> +
> +.L31:
> + movq (%rsi), %r9
> + movq -7(%rsi,%rdx), %rcx
> + movq %r9, (%rdi)
> + movq %rcx, -7(%rdi,%rdx)
> + jmp L(copied_cross_page)
> +
> +.L32:
> + movl (%rsi), %r9d
> + movl -3(%rsi,%rdx), %ecx
> + movl %r9d, (%rdi)
> + movl %ecx, -3(%rdi,%rdx)
> + jmp L(copied_cross_page)
> +
> +.L15:
> + movzbl (%rsi), %ecx
> + movb %cl, (%rdi)
> + jmp L(copied_cross_page)
> +
> +END (STRCPY)
> +
> +#endif
>