
Re: [PATCH v2] Improve strcpy: Faster unaligned loads.


On 09/10/2013 10:21 AM, Ondřej Bílka wrote:
> Here is v2.
> 
> 	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S: New implementation.
> 	* sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: Use new
> 	implementation.
> 	* sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Do tailcall            
> 	of __strcpy_sse2_unaligned_tail.

The ChangeLog doesn't match the patch: it names strcpy-sse2-unaligned-new.S
while the patch adds strcpy-sse2-unaligned-v2.S, and the Makefile change
isn't mentioned at all.
 
It still needs more comments, please; see my notes below.

> ---
>  sysdeps/x86_64/multiarch/Makefile                  |   2 +-
>  sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S   |   2 +-
>  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S   |   6 +-
>  .../x86_64/multiarch/strcpy-sse2-unaligned-v2.S    | 280 +++++++++++++++++++++
>  4 files changed, 287 insertions(+), 3 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..88eaf9d 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -14,7 +14,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>  		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>  		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
> -		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
> +		   strcpy-sse2-unaligned-v2 strncpy-sse2-unaligned \

OK.

>  		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>  		   strcat-sse2-unaligned strncat-sse2-unaligned \
>  		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> index 34231f8..363b692 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> @@ -1,3 +1,3 @@
>  #define USE_AS_STPCPY
>  #define STRCPY __stpcpy_sse2_unaligned
> -#include "strcpy-sse2-unaligned.S"
> +#include "strcpy-sse2-unaligned-v2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..d26db8a 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> @@ -273,7 +273,11 @@ L(StartStrcpyPart):
>  	test	%r8, %r8
>  	jz	L(ExitZero)
>  #  define USE_AS_STRNCPY
> +#  include "strcpy-sse2-unaligned.S"
> +
> +# else
> +	jmp __strcpy_sse2_unaligned_tail
> +  END (STRCAT)
>  # endif
>  
> -# include "strcpy-sse2-unaligned.S"
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> new file mode 100644
> index 0000000..ac9ac55
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> @@ -0,0 +1,280 @@
> +/* strcpy with SSE2 and unaligned load
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCPY
> +#  define STRCPY  __strcpy_sse2_unaligned
> +# endif
> +
> +# define ALIGN(x) .p2align x
> +
> +ENTRY (STRCPY)
> +	/* We use the basic loop described in
> +	   http://sourceware.org/glibc/wiki/Optimizations/string_functions.  */

This is an excellent writeup. I like that you created a distinct
Optimizations section in the wiki for this and I'm excited to see
more writeups there.

This looks almost done.

I would still like to see comments in the assembly that at least
outline the layout of the code, so that someone reading the writeup
can follow the code without the mental effort of mapping the code
to the templates.

Please post a v3 with inline comments and I'll review it quickly.
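
To make that concrete, here is the structure I read out of the patch, as
a C model.  This is my sketch, not code from the patch: the function and
helper names are made up, memcpy stands in for the overlapping
load/store tails, and the stpcpy/strncpy variants are left out.

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

/* Mask with bit i set iff byte i of the 64 bytes in V0..V3 is zero.  */
static uint64_t
zero_mask (__m128i v0, __m128i v1, __m128i v2, __m128i v3)
{
  const __m128i z = _mm_setzero_si128 ();
  return (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v0, z))
         | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, z)) << 16
         | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v2, z)) << 32
         | (uint64_t) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v3, z)) << 48;
}

char *
strcpy_model (char *dst, const char *src)
{
  uint64_t mask;

  if (((uintptr_t) src & 4095) <= 4096 - 64)
    /* The 64 bytes at SRC stay within one page; read them unaligned.  */
    mask = zero_mask (_mm_loadu_si128 ((const __m128i *) src),
                      _mm_loadu_si128 ((const __m128i *) (src + 16)),
                      _mm_loadu_si128 ((const __m128i *) (src + 32)),
                      _mm_loadu_si128 ((const __m128i *) (src + 48)));
  else
    {
      /* cross_page: read the aligned 64-byte block containing SRC and
         shift the mask; bit 63 acts as a sentinel so the bit scan below
         always finds something.  */
      const char *blk = (const char *) ((uintptr_t) src & ~(uintptr_t) 63);
      mask = zero_mask (_mm_load_si128 ((const __m128i *) blk),
                        _mm_load_si128 ((const __m128i *) (blk + 16)),
                        _mm_load_si128 ((const __m128i *) (blk + 32)),
                        _mm_load_si128 ((const __m128i *) (blk + 48)));
      mask = (mask | (1ULL << 63)) >> (src - blk);
    }

  if (mask != 0)
    {
      size_t len = __builtin_ctzll (mask);      /* bsfq */
      memcpy (dst, src, len + 1);
      if (src[len] == '\0')
        return dst;
      /* Only the sentinel was set: no terminator seen yet; fall through
         to the loop (copied_cross_page -> prepare_loop).  */
    }
  else
    memcpy (dst, src, 64);      /* head is zero-free; copy it whole */

  /* Loop: advance SRC to the next 64-byte boundary and move DST by the
     same amount, so all loads below are aligned and every source byte
     keeps the same destination offset.  */
  const char *s = (const char *) (((uintptr_t) src + 64) & ~(uintptr_t) 63);
  char *d = dst + (s - src);
  for (;;)
    {
      __m128i v0 = _mm_load_si128 ((const __m128i *) s);
      __m128i v1 = _mm_load_si128 ((const __m128i *) (s + 16));
      __m128i v2 = _mm_load_si128 ((const __m128i *) (s + 32));
      __m128i v3 = _mm_load_si128 ((const __m128i *) (s + 48));
      /* pminub trick: the byte-wise unsigned minimum of the four vectors
         has a zero byte iff at least one of them does.  */
      __m128i min = _mm_min_epu8 (_mm_min_epu8 (v0, v1),
                                  _mm_min_epu8 (v2, v3));
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (min, _mm_setzero_si128 ())))
        {
          size_t len = __builtin_ctzll (zero_mask (v0, v1, v2, v3));
          memcpy (d, s, len + 1);
          return dst;
        }
      memcpy (d, s, 64);        /* four movdqu stores in the assembly */
      s += 64;
      d += 64;
    }
}

Comments along these lines at the corresponding labels would be enough.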

> +	movq	%rsi, %rdx
> +	pxor	%xmm4, %xmm4
> +	movq	%rdi, %rax
> +	pxor	%xmm5, %xmm5
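> +	/* If (%rsi & 4095) > 4096 - 64, an unaligned 64-byte read could
> +	   cross into the next, possibly unmapped, page.  */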
> +	andl	$4095, %edx
> +	pxor	%xmm6, %xmm6
> +	cmpq	$4032, %rdx
> +	ja	L(cross_page)
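> +	/* Scan the first 64 bytes for the terminating zero, 32 bytes at
> +	   a time, building one bit per byte in %rdx.  */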
> +	movdqu	(%rsi), %xmm1
> +	pxor	%xmm7, %xmm7
> +	movdqu	16(%rsi), %xmm2
> +	pcmpeqb	%xmm1, %xmm4
> +	pmovmskb	%xmm4, %ecx
> +	pcmpeqb	%xmm2, %xmm5
> +	pmovmskb	%xmm5, %edx
> +	salq	$16, %rdx
> +	orq	%rcx, %rdx
> +	jne	L(less_32_bytes)
> +	movdqu	32(%rsi), %xmm3
> +	movdqu	48(%rsi), %xmm4
> +	pcmpeqb	%xmm3, %xmm6
> +	pcmpeqb	%xmm4, %xmm7
> +	pmovmskb	%xmm6, %edx
> +	pmovmskb	%xmm7, %ecx
> +	salq	$32, %rdx
> +	salq	$48, %rcx
> +	orq	%rcx, %rdx
> +	jne	L(less_64_bytes)
> +	movdqu	%xmm1, (%rdi)
> +	movdqu	%xmm2, 16(%rdi)
> +	movdqu	%xmm3, 32(%rdi)
> +	movdqu	%xmm4, 48(%rdi)
> +L(prepare_loop):
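> +	/* Everything up to the next 64-byte boundary of %rsi has been
> +	   copied.  Advance %rsi to that boundary and move %rdi by the
> +	   same amount so the loads in the loop are aligned.  */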
> +	leaq	64(%rsi), %rdx
> +	andq	$-64, %rdx
> +	addq	%rdx, %rdi
> +	pxor	%xmm5, %xmm5
> +	subq	%rsi, %rdi
> +	movq	%rdx, %rsi
> +	jmp	L(loop_entry)
> +	
> +	ALIGN (4)
> +L(loop):
> +	movdqu	%xmm1, (%rdi)
> +	addq	$64, %rsi
> +	movdqu	%xmm2, 16(%rdi)
> +	movdqu	%xmm3, 32(%rdi)
> +	movdqu	%xmm4, 48(%rdi)
> +	addq	$64, %rdi
> +L(loop_entry):
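> +	/* Process 64 bytes per iteration.  The unsigned byte minimum of
> +	   the four vectors contains a zero iff at least one of them does,
> +	   so a single pcmpeqb/pmovmskb pair tests all 64 bytes.  */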
> +	movdqa	32(%rsi), %xmm3
> +	movdqa	48(%rsi), %xmm4
> +	movdqa	%xmm3, %xmm0
> +	movdqa	16(%rsi), %xmm2
> +	pminub	%xmm4, %xmm0
> +	movdqa	(%rsi), %xmm1
> +	pminub	%xmm2, %xmm0
> +	pminub	%xmm1, %xmm0
> +	pcmpeqb	%xmm5, %xmm0
> +	pmovmskb	%xmm0, %edx
> +	testq	%rdx, %rdx
> +	je	L(loop)
> +
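> +	/* A zero byte lies in this block.  Rebuild the per-byte mask; the
> +	   pminub mask serves for the fourth vector, since when the first
> +	   three are zero-free its set bits can only come from %xmm4.  */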
> +	salq	$48, %rdx
> +	pcmpeqb	%xmm1, %xmm5
> +	pcmpeqb	%xmm2, %xmm6
> +	pmovmskb	%xmm5, %ecx
> +	pmovmskb	%xmm6, %r8d
> +	pcmpeqb	%xmm3, %xmm7
> +	orq	%rcx, %rdx
> +	pmovmskb	%xmm7, %r9d
> +	salq	$16, %r8
> +	orq	%r8, %rdx
> +	salq	$32, %r9
> +	orq	%r9, %rdx
> +	bsfq	%rdx, %rcx
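> +	/* %rcx = offset of the terminating zero from the current %rsi.  */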
> +#ifdef USE_AS_STPCPY
> +	lea	(%rdi, %rcx), %rax
> +#endif
> +	cmpq	$32, %rcx
> +	jb	L(less_32_bytes)
> +	movdqu	-31(%rsi,%rcx), %xmm3
> +	movdqu	-15(%rsi,%rcx), %xmm4
> +	movdqu	%xmm1, (%rdi)
> +	movdqu	%xmm2, 16(%rdi)
> +	movdqu	%xmm3, -31(%rdi,%rcx)
> +	movdqu	%xmm4, -15(%rdi,%rcx)
> +	ret
> +
> +	
> +	ALIGN (3)
> +L(between_16_31_bytes):
> +	movdqu	-15(%rsi,%rdx), %xmm2
> +	movdqu	%xmm1, (%rdi)
> +	movdqu	%xmm2, -15(%rdi,%rdx)
> +	ret
> +	
> +	ALIGN (3)
> +L(less_32_bytes):
> +	bsfq	%rdx, %rdx
> +#ifdef USE_AS_STPCPY
> +	lea	(%rdi, %rdx), %rax
> +#endif
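> +	/* Dispatch on the terminator offset in %rdx; each case copies a
> +	   head and a tail with two overlapping accesses of the widest
> +	   size that fits.  */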
> +	cmpq	$15, %rdx
> +	jae	L(between_16_31_bytes)
> +	cmpq	$7, %rdx
> +	jae	L(between_8_15_bytes)
> +	cmpq	$3, %rdx
> +	jae	L(between_4_7_bytes)
> +	cmpq	$1, %rdx
> +	jb	L(between_1_1_bytes) /* We need to write terminating zero.  */
> +	movzwl	-1(%rsi,%rdx), %ecx
> +	movzwl	(%rsi), %esi
> +	movw	%si, (%rdi)
> +	movw	%cx, -1(%rdi,%rdx)
> +	ret
> +
> +	ALIGN (3)
> +L(less_64_bytes):
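> +	/* The terminator is in bytes 32..63 of the first block: store the
> +	   first 32 bytes, then an overlapping 32 bytes ending at it.  */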
> +	bsfq	%rdx, %rdx
> +#ifdef USE_AS_STPCPY
> +	lea	(%rdi, %rdx), %rax
> +#endif
> +	movdqu	-31(%rsi,%rdx), %xmm3
> +	movdqu	-15(%rsi,%rdx), %xmm0
> +	movdqu	%xmm1, (%rdi)
> +	movdqu	%xmm2, 16(%rdi)
> +	movdqu	%xmm3, -31(%rdi,%rdx)
> +	movdqu	%xmm0, -15(%rdi,%rdx)
> +	ret
> +
> +	ALIGN (3)
> +L(between_8_15_bytes):
> +	movq	-7(%rsi,%rdx), %rcx
> +	movq	(%rsi), %rsi
> +	movq	%rsi, (%rdi)
> +	movq	%rcx, -7(%rdi,%rdx)
> +	ret
> +
> +	ALIGN (3)
> +L(between_4_7_bytes):
> +	movl	-3(%rsi,%rdx), %ecx
> +	movl	(%rsi), %esi
> +	movl	%esi, (%rdi)
> +	movl	%ecx, -3(%rdi,%rdx)
> +	ret
> +
> +L(between_1_1_bytes):
> +	movzbl	(%rsi), %edx
> +	movb	%dl, (%rdi)
> +	ret
> +
> +	ALIGN (4)
> +L(cross_page):
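> +	/* The terminator may be closer than 64 bytes, so do not read past
> +	   the end of %rsi's page.  Scan the aligned 64-byte block that
> +	   contains %rsi, shift the zero mask so bit 0 corresponds to
> +	   (%rsi), and set bit 63 as a sentinel so that bsfq below always
> +	   finds a bit.  */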
> +	movq	%rsi, %rcx
> +	pxor	%xmm0, %xmm0
> +	andq	$-64, %rcx
> +	movabsq	$-9223372036854775808, %r10
> +	movdqa	(%rcx), %xmm4
> +	movdqa	16(%rcx), %xmm3
> +	pcmpeqb	%xmm0, %xmm4
> +	movdqa	32(%rcx), %xmm2
> +	pcmpeqb	%xmm0, %xmm3
> +	pmovmskb	%xmm4, %edx
> +	movdqa	48(%rcx), %xmm1
> +	pcmpeqb	%xmm0, %xmm2
> +	pcmpeqb	%xmm0, %xmm1
> +	orq	%r10, %rdx
> +	pmovmskb	%xmm3, %r10d
> +	pmovmskb	%xmm2, %r9d
> +	salq	$16, %r10
> +	orq	%r10, %rdx
> +	pmovmskb	%xmm1, %r8d
> +	salq	$32, %r9
> +	orq	%r9, %rdx
> +	salq	$48, %r8
> +	orq	%r8, %rdx
> +	movq	%rsi, %r10
> +	subq	%rcx, %r10
> +	movq	%r10, %rcx
> +	shrq	%cl, %rdx
> +	bsfq	%rdx, %rdx
> +#ifdef USE_AS_STPCPY
> +	lea	(%rdi, %rdx), %rax
> +#endif
> +	cmpq	$15, %rdx
> +	jbe	L(cross_less_16_bytes)
> +	cmpq	$31, %rdx
> +	jbe	L(cross_16_31_bytes)
> +	movdqu	(%rsi), %xmm3
> +	movdqu	16(%rsi), %xmm2
> +	movdqu	-31(%rsi,%rdx), %xmm1
> +	movdqu	-15(%rsi,%rdx), %xmm0
> +	movdqu	%xmm3, (%rdi)
> +	movdqu	%xmm2, 16(%rdi)
> +	movdqu	%xmm1, -31(%rdi,%rdx)
> +	movdqu	%xmm0, -15(%rdi,%rdx)
> +L(copied_cross_page):
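> +	/* If the byte at the found offset really is zero we are done;
> +	   otherwise only the sentinel fired and we continue in the loop.  */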
> +	pxor	%xmm4, %xmm4
> +	pxor	%xmm5, %xmm5
> +	pxor	%xmm6, %xmm6
> +	pxor	%xmm7, %xmm7
> +	cmpb	$0, (%rsi,%rdx)
> +	jne	L(prepare_loop)
> +	ret
> +
> +	ALIGN (3)
> +L(cross_less_16_bytes):
> +	cmpq	$7, %rdx
> +	jae	L(cross_8_15_bytes)
> +	cmpq	$3, %rdx
> +	jae	L(cross_4_7_bytes)
> +	cmpq	$1, %rdx
> +	jb	L(cross_1_byte)
> +	movzwl	(%rsi), %ecx
> +	movw	%cx, (%rdi)
> +	movzwl	-1(%rsi,%rdx), %ecx
> +	movw	%cx, -1(%rdi,%rdx)
> +	jmp	L(copied_cross_page)
> +	
> +	ALIGN (3)
> +L(cross_16_31_bytes):
> +	movdqu	(%rsi), %xmm1
> +	movdqu	-15(%rsi,%rdx), %xmm0
> +	movdqu	%xmm1, (%rdi)
> +	movdqu	%xmm0, -15(%rdi,%rdx)
> +	jmp	L(copied_cross_page)
> +
> +L(cross_8_15_bytes):
> +	movq	(%rsi), %r9
> +	movq	-7(%rsi,%rdx), %rcx
> +	movq	%r9, (%rdi)
> +	movq	%rcx, -7(%rdi,%rdx)
> +	jmp	L(copied_cross_page)
> +
> +L(cross_4_7_bytes):
> +	movl	(%rsi), %r9d
> +	movl	-3(%rsi,%rdx), %ecx
> +	movl	%r9d, (%rdi)
> +	movl	%ecx, -3(%rdi,%rdx)
> +	jmp	L(copied_cross_page)
> +
> +L(cross_1_byte):
> +	movzbl	(%rsi), %ecx
> +	movb	%cl, (%rdi)
> +	jmp	L(copied_cross_page)
> +
> +END (STRCPY)
> +
> +#endif
> 

