This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [RFC] Improve strcat


On 09/11/2013 06:23 AM, Ondřej Bílka wrote:
> On Tue, Sep 10, 2013 at 10:28:44PM +0200, Ondřej Bílka wrote:
>> Hi Carlos,
>>
>> Here is strcpy with comments. To get the structure I decided to include
>> the ssse3 loop in this patch. If you are ok with splitting it, the loop
>> header and the ssse3 loop could be reviewed separately.
>>
>> I omitted the actual strcat calls, as I have a patch ready that uses them
>> and it needs a bit of code movement.
>>
> For strcat there was one optimization opportunity left - finding the
> terminating zeros in the source and destination in parallel. This patch
> does exactly that.
> 
> This allows us to jump directly to code that copies a given number of
> bytes, so I put the strcat implementation into strcpy-sse2-unaligned-v2.S.
> 
> I do not handle strncat yet, so I copied the old strcat*.S to strncat*.S.
> 
> I did not optimize instruction scheduling yet, to keep the code easier
> to read.
> 
> Results of the benchmark are here:
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcat_profile.html
> 
> Comments?
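
For reference, here is a minimal C sketch of that idea (not part of the
patch; zero64() is a hypothetical scalar stand-in for the SSE2
pcmpeqb/pmovmskb sequence, and the real code first rules out reads that
cross a page, which is what the strcat_cross_page path below handles):

    #include <stdint.h>
    #include <string.h>

    /* Return a mask with one bit set per NUL byte among the 64 bytes
       at P.  Scalar stand-in for pcmpeqb/pmovmskb; the real code must
       not read past a page boundary, hence the cross-page check.  */
    static uint64_t
    zero64 (const char *p)
    {
      uint64_t mask = 0;
      for (int i = 0; i < 64; i++)
        if (p[i] == '\0')
          mask |= (uint64_t) 1 << i;
      return mask;
    }

    char *
    strcat_sketch (char *dest, const char *src)
    {
      uint64_t d = zero64 (dest);    /* scan dest ...            */
      uint64_t s = zero64 (src);     /* ... and src in parallel  */
      char *end = dest;

      if (!d)
        {
          /* No NUL in the first 64 bytes of dest: scan aligned
             64-byte blocks until one contains the terminator.  */
          end = (char *) ((uintptr_t) dest & ~(uintptr_t) 63);
          do
            {
              end += 64;
              d = zero64 (end);
            }
          while (d == 0);
        }
      end += __builtin_ctzll (d);    /* bsf: offset of the NUL   */

      if (s)
        /* The src terminator is already known, so jump straight to
           a copy of the known length -- no second scan of src.  */
        memcpy (end, src, __builtin_ctzll (s) + 1);
      else
        strcpy (end, src);           /* long src: strcpy tail    */

      return dest;
    }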

Typo? STRAT? See below.
 
> ---
>  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S   | 280 +------
>  sysdeps/x86_64/multiarch/strcat-ssse3.S            | 868 +-------------------
>  .../x86_64/multiarch/strcpy-sse2-unaligned-v2.S    | 217 ++++-
>  sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S         |   2 +-
>  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S  | 285 ++++++-
>  sysdeps/x86_64/multiarch/strncat-ssse3.S           | 869 ++++++++++++++++++++-
>  6 files changed, 1364 insertions(+), 1157 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..03c1f18 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S

Please remind me why we're keeping this file around if
the implementation is in strcpy-sse2-unaligned-v2.S?

> @@ -1,279 +1 @@
> -/* strcat with SSE2
> -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_sse2_unaligned
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -	mov	%rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> -   implementation gets merged.  */
> -
> -	xor	%rax, %rax
> -	mov	%edi, %ecx
> -	and	$0x3f, %ecx
> -	pxor	%xmm0, %xmm0
> -	cmp	$0x30, %ecx
> -	ja	L(next)
> -	movdqu	(%rdi), %xmm1
> -	pcmpeqb	%xmm1, %xmm0
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit_less16)
> -	mov	%rdi, %rax
> -	and	$-16, %rax
> -	jmp	L(align16_start)
> -L(next):
> -	mov	%rdi, %rax
> -	and	$-16, %rax
> -	pcmpeqb	(%rax), %xmm0
> -	mov	$-1, %r10d
> -	sub	%rax, %rcx
> -	shl	%cl, %r10d
> -	pmovmskb %xmm0, %edx
> -	and	%r10d, %edx
> -	jnz	L(exit)
> -
> -L(align16_start):
> -	pxor	%xmm0, %xmm0
> -	pxor	%xmm1, %xmm1
> -	pxor	%xmm2, %xmm2
> -	pxor	%xmm3, %xmm3
> -	pcmpeqb	16(%rax), %xmm0
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit16)
> -
> -	pcmpeqb	32(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit32)
> -
> -	pcmpeqb	48(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit48)
> -
> -	pcmpeqb	64(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit64)
> -
> -	pcmpeqb	80(%rax), %xmm0
> -	add	$64, %rax
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit16)
> -
> -	pcmpeqb	32(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit32)
> -
> -	pcmpeqb	48(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit48)
> -
> -	pcmpeqb	64(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit64)
> -
> -	pcmpeqb	80(%rax), %xmm0
> -	add	$64, %rax
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit16)
> -
> -	pcmpeqb	32(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit32)
> -
> -	pcmpeqb	48(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit48)
> -
> -	pcmpeqb	64(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit64)
> -
> -	pcmpeqb	80(%rax), %xmm0
> -	add	$64, %rax
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit16)
> -
> -	pcmpeqb	32(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit32)
> -
> -	pcmpeqb	48(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit48)
> -
> -	pcmpeqb	64(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit64)
> -
> -	test	$0x3f, %rax
> -	jz	L(align64_loop)
> -
> -	pcmpeqb	80(%rax), %xmm0
> -	add	$80, %rax
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$0x3f, %rax
> -	jz	L(align64_loop)
> -
> -	pcmpeqb	16(%rax), %xmm1
> -	add	$16, %rax
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$0x3f, %rax
> -	jz	L(align64_loop)
> -
> -	pcmpeqb	16(%rax), %xmm2
> -	add	$16, %rax
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	test	$0x3f, %rax
> -	jz	L(align64_loop)
> -
> -	pcmpeqb	16(%rax), %xmm3
> -	add	$16, %rax
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	jnz	L(exit)
> -
> -	add	$16, %rax
> -	.p2align 4
> -	L(align64_loop):
> -	movaps	(%rax),	%xmm4
> -	pminub	16(%rax),	%xmm4
> -	movaps	32(%rax),	%xmm5
> -	pminub	48(%rax),	%xmm5
> -	add	$64,	%rax
> -	pminub	%xmm4,	%xmm5
> -	pcmpeqb	%xmm0,	%xmm5
> -	pmovmskb %xmm5,	%edx
> -	test	%edx,	%edx
> -	jz	L(align64_loop)
> -
> -	pcmpeqb	-64(%rax), %xmm0
> -	sub	$80,	%rax
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	jnz	L(exit16)
> -
> -	pcmpeqb	32(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	jnz	L(exit32)
> -
> -	pcmpeqb	48(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	jnz	L(exit48)
> -
> -	pcmpeqb	64(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$64, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit):
> -	sub	%rdi, %rax
> -L(exit_less16):
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit16):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$16, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit32):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$32, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit48):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$48, %rax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit64):
> -	sub	%rdi, %rax
> -	bsf	%rdx, %rdx
> -	add	%rdx, %rax
> -	add	$64, %rax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	lea	(%r9, %rax), %rdi
> -	mov	%rsi, %rcx
> -	mov	%r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-sse2-unaligned.S"
> -#endif
> +/* Implemented in strcpy-sse2-unaligned-v2.S  */
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> index 8101b91..fd5fba7 100644
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> @@ -1,867 +1 @@
> -/* strcat with SSSE3
> -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> -	mov	%rdx, %r8
> -# endif
> -
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> -   implementation gets merged.  */
> -
> -	xor	%eax, %eax
> -	cmpb	$0, (%rdi)
> -	jz	L(exit_tail0)
> -	cmpb	$0, 1(%rdi)
> -	jz	L(exit_tail1)
> -	cmpb	$0, 2(%rdi)
> -	jz	L(exit_tail2)
> -	cmpb	$0, 3(%rdi)
> -	jz	L(exit_tail3)
> -
> -	cmpb	$0, 4(%rdi)
> -	jz	L(exit_tail4)
> -	cmpb	$0, 5(%rdi)
> -	jz	L(exit_tail5)
> -	cmpb	$0, 6(%rdi)
> -	jz	L(exit_tail6)
> -	cmpb	$0, 7(%rdi)
> -	jz	L(exit_tail7)
> -
> -	cmpb	$0, 8(%rdi)
> -	jz	L(exit_tail8)
> -	cmpb	$0, 9(%rdi)
> -	jz	L(exit_tail9)
> -	cmpb	$0, 10(%rdi)
> -	jz	L(exit_tail10)
> -	cmpb	$0, 11(%rdi)
> -	jz	L(exit_tail11)
> -
> -	cmpb	$0, 12(%rdi)
> -	jz	L(exit_tail12)
> -	cmpb	$0, 13(%rdi)
> -	jz	L(exit_tail13)
> -	cmpb	$0, 14(%rdi)
> -	jz	L(exit_tail14)
> -	cmpb	$0, 15(%rdi)
> -	jz	L(exit_tail15)
> -	pxor	%xmm0, %xmm0
> -	lea	16(%rdi), %rcx
> -	lea	16(%rdi), %rax
> -	and	$-16, %rax
> -
> -	pcmpeqb	(%rax), %xmm0
> -	pmovmskb %xmm0, %edx
> -	pxor	%xmm1, %xmm1
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	pxor	%xmm2, %xmm2
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	pxor	%xmm3, %xmm3
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm0
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm0
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm0
> -	pmovmskb %xmm0, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm1
> -	pmovmskb %xmm1, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm2
> -	pmovmskb %xmm2, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	pcmpeqb	(%rax), %xmm3
> -	pmovmskb %xmm3, %edx
> -	test	%edx, %edx
> -	lea	16(%rax), %rax
> -	jnz	L(exit)
> -
> -	and	$-0x40, %rax
> -
> -	.p2align 4
> -L(aligned_64):
> -	pcmpeqb	(%rax), %xmm0
> -	pcmpeqb	16(%rax), %xmm1
> -	pcmpeqb	32(%rax), %xmm2
> -	pcmpeqb	48(%rax), %xmm3
> -	pmovmskb %xmm0, %edx
> -	pmovmskb %xmm1, %r11d
> -	pmovmskb %xmm2, %r10d
> -	pmovmskb %xmm3, %r9d
> -	or	%edx, %r9d
> -	or	%r11d, %r9d
> -	or	%r10d, %r9d
> -	lea	64(%rax), %rax
> -	jz	L(aligned_64)
> -
> -	test	%edx, %edx
> -	jnz	L(aligned_64_exit_16)
> -	test	%r11d, %r11d
> -	jnz	L(aligned_64_exit_32)
> -	test	%r10d, %r10d
> -	jnz	L(aligned_64_exit_48)
> -
> -L(aligned_64_exit_64):
> -	pmovmskb %xmm3, %edx
> -	jmp	L(exit)
> -
> -L(aligned_64_exit_48):
> -	lea	-16(%rax), %rax
> -	mov	%r10d, %edx
> -	jmp	L(exit)
> -
> -L(aligned_64_exit_32):
> -	lea	-32(%rax), %rax
> -	mov	%r11d, %edx
> -	jmp	L(exit)
> -
> -L(aligned_64_exit_16):
> -	lea	-48(%rax), %rax
> -
> -L(exit):
> -	sub	%rcx, %rax
> -	test	%dl, %dl
> -	jz	L(exit_high)
> -	test	$0x01, %dl
> -	jnz	L(exit_tail0)
> -
> -	test	$0x02, %dl
> -	jnz	L(exit_tail1)
> -
> -	test	$0x04, %dl
> -	jnz	L(exit_tail2)
> -
> -	test	$0x08, %dl
> -	jnz	L(exit_tail3)
> -
> -	test	$0x10, %dl
> -	jnz	L(exit_tail4)
> -
> -	test	$0x20, %dl
> -	jnz	L(exit_tail5)
> -
> -	test	$0x40, %dl
> -	jnz	L(exit_tail6)
> -	add	$7, %eax
> -L(exit_tail0):
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_high):
> -	add	$8, %eax
> -	test	$0x01, %dh
> -	jnz	L(exit_tail0)
> -
> -	test	$0x02, %dh
> -	jnz	L(exit_tail1)
> -
> -	test	$0x04, %dh
> -	jnz	L(exit_tail2)
> -
> -	test	$0x08, %dh
> -	jnz	L(exit_tail3)
> -
> -	test	$0x10, %dh
> -	jnz	L(exit_tail4)
> -
> -	test	$0x20, %dh
> -	jnz	L(exit_tail5)
> -
> -	test	$0x40, %dh
> -	jnz	L(exit_tail6)
> -	add	$7, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail1):
> -	add	$1, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail2):
> -	add	$2, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail3):
> -	add	$3, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail4):
> -	add	$4, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail5):
> -	add	$5, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail6):
> -	add	$6, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail7):
> -	add	$7, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail8):
> -	add	$8, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail9):
> -	add	$9, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail10):
> -	add	$10, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail11):
> -	add	$11, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail12):
> -	add	$12, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail13):
> -	add	$13, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail14):
> -	add	$14, %eax
> -	jmp	L(StartStrcpyPart)
> -
> -	.p2align 4
> -L(exit_tail15):
> -	add	$15, %eax
> -
> -	.p2align 4
> -L(StartStrcpyPart):
> -	mov	%rsi, %rcx
> -	lea	(%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> -	test	%r8, %r8
> -	jz	L(StrncatExit0)
> -	cmp	$8, %r8
> -	jbe	L(StrncatExit8Bytes)
> -# endif
> -	cmpb	$0, (%rcx)
> -	jz	L(Exit1)
> -	cmpb	$0, 1(%rcx)
> -	jz	L(Exit2)
> -	cmpb	$0, 2(%rcx)
> -	jz	L(Exit3)
> -	cmpb	$0, 3(%rcx)
> -	jz	L(Exit4)
> -	cmpb	$0, 4(%rcx)
> -	jz	L(Exit5)
> -	cmpb	$0, 5(%rcx)
> -	jz	L(Exit6)
> -	cmpb	$0, 6(%rcx)
> -	jz	L(Exit7)
> -	cmpb	$0, 7(%rcx)
> -	jz	L(Exit8)
> -	cmpb	$0, 8(%rcx)
> -	jz	L(Exit9)
> -# ifdef USE_AS_STRNCAT
> -	cmp	$16, %r8
> -	jb	L(StrncatExit15Bytes)
> -# endif
> -	cmpb	$0, 9(%rcx)
> -	jz	L(Exit10)
> -	cmpb	$0, 10(%rcx)
> -	jz	L(Exit11)
> -	cmpb	$0, 11(%rcx)
> -	jz	L(Exit12)
> -	cmpb	$0, 12(%rcx)
> -	jz	L(Exit13)
> -	cmpb	$0, 13(%rcx)
> -	jz	L(Exit14)
> -	cmpb	$0, 14(%rcx)
> -	jz	L(Exit15)
> -	cmpb	$0, 15(%rcx)
> -	jz	L(Exit16)
> -# ifdef USE_AS_STRNCAT
> -	cmp	$16, %r8
> -	je	L(StrncatExit16)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> -	.p2align 4
> -L(CopyFrom1To16Bytes):
> -	add	%rsi, %rdx
> -	add	%rsi, %rcx
> -
> -	test	%al, %al
> -	jz	L(ExitHigh)
> -	test	$0x01, %al
> -	jnz	L(Exit1)
> -	test	$0x02, %al
> -	jnz	L(Exit2)
> -	test	$0x04, %al
> -	jnz	L(Exit3)
> -	test	$0x08, %al
> -	jnz	L(Exit4)
> -	test	$0x10, %al
> -	jnz	L(Exit5)
> -	test	$0x20, %al
> -	jnz	L(Exit6)
> -	test	$0x40, %al
> -	jnz	L(Exit7)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(ExitHigh):
> -	test	$0x01, %ah
> -	jnz	L(Exit9)
> -	test	$0x02, %ah
> -	jnz	L(Exit10)
> -	test	$0x04, %ah
> -	jnz	L(Exit11)
> -	test	$0x08, %ah
> -	jnz	L(Exit12)
> -	test	$0x10, %ah
> -	jnz	L(Exit13)
> -	test	$0x20, %ah
> -	jnz	L(Exit14)
> -	test	$0x40, %ah
> -	jnz	L(Exit15)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	8(%rcx), %xmm1
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	%xmm1, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit1):
> -	xor	%ah, %ah
> -	movb	%ah, 1(%rdx)
> -L(Exit1):
> -	movb	(%rcx), %al
> -	movb	%al, (%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit2):
> -	xor	%ah, %ah
> -	movb	%ah, 2(%rdx)
> -L(Exit2):
> -	movw	(%rcx), %ax
> -	movw	%ax, (%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit3):
> -	xor	%ah, %ah
> -	movb	%ah, 3(%rdx)
> -L(Exit3):
> -	movw	(%rcx), %ax
> -	movw	%ax, (%rdx)
> -	movb	2(%rcx), %al
> -	movb	%al, 2(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit4):
> -	xor	%ah, %ah
> -	movb	%ah, 4(%rdx)
> -L(Exit4):
> -	mov	(%rcx), %eax
> -	mov	%eax, (%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit5):
> -	xor	%ah, %ah
> -	movb	%ah, 5(%rdx)
> -L(Exit5):
> -	mov	(%rcx), %eax
> -	mov	%eax, (%rdx)
> -	movb	4(%rcx), %al
> -	movb	%al, 4(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit6):
> -	xor	%ah, %ah
> -	movb	%ah, 6(%rdx)
> -L(Exit6):
> -	mov	(%rcx), %eax
> -	mov	%eax, (%rdx)
> -	movw	4(%rcx), %ax
> -	movw	%ax, 4(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit7):
> -	xor	%ah, %ah
> -	movb	%ah, 7(%rdx)
> -L(Exit7):
> -	mov	(%rcx), %eax
> -	mov	%eax, (%rdx)
> -	mov	3(%rcx), %eax
> -	mov	%eax, 3(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit8):
> -	xor	%ah, %ah
> -	movb	%ah, 8(%rdx)
> -L(Exit8):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit9):
> -	xor	%ah, %ah
> -	movb	%ah, 9(%rdx)
> -L(Exit9):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movb	8(%rcx), %al
> -	movb	%al, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit10):
> -	xor	%ah, %ah
> -	movb	%ah, 10(%rdx)
> -L(Exit10):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movw	8(%rcx), %ax
> -	movw	%ax, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit11):
> -	xor	%ah, %ah
> -	movb	%ah, 11(%rdx)
> -L(Exit11):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	mov	7(%rcx), %eax
> -	mov	%eax, 7(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit12):
> -	xor	%ah, %ah
> -	movb	%ah, 12(%rdx)
> -L(Exit12):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	mov	8(%rcx), %eax
> -	mov	%eax, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit13):
> -	xor	%ah, %ah
> -	movb	%ah, 13(%rdx)
> -L(Exit13):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	5(%rcx), %xmm1
> -	movlpd	%xmm1, 5(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit14):
> -	xor	%ah, %ah
> -	movb	%ah, 14(%rdx)
> -L(Exit14):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	6(%rcx), %xmm1
> -	movlpd	%xmm1, 6(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit15):
> -	xor	%ah, %ah
> -	movb	%ah, 15(%rdx)
> -L(Exit15):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	7(%rcx), %xmm1
> -	movlpd	%xmm1, 7(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit16):
> -	xor	%ah, %ah
> -	movb	%ah, 16(%rdx)
> -L(Exit16):
> -	movlpd	(%rcx), %xmm0
> -	movlpd	8(%rcx), %xmm1
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	%xmm1, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> -	.p2align 4
> -L(CopyFrom1To16BytesCase2):
> -	add	$16, %r8
> -	add	%rsi, %rcx
> -	lea	(%rsi, %rdx), %rsi
> -	lea	-9(%r8), %rdx
> -	and	$1<<7, %dh
> -	or	%al, %dh
> -	test	%dh, %dh
> -	lea	(%rsi), %rdx
> -	jz	L(ExitHighCase2)
> -
> -	test	$0x01, %al
> -	jnz	L(Exit1)
> -	cmp	$1, %r8
> -	je	L(StrncatExit1)
> -	test	$0x02, %al
> -	jnz	L(Exit2)
> -	cmp	$2, %r8
> -	je	L(StrncatExit2)
> -	test	$0x04, %al
> -	jnz	L(Exit3)
> -	cmp	$3, %r8
> -	je	L(StrncatExit3)
> -	test	$0x08, %al
> -	jnz	L(Exit4)
> -	cmp	$4, %r8
> -	je	L(StrncatExit4)
> -	test	$0x10, %al
> -	jnz	L(Exit5)
> -	cmp	$5, %r8
> -	je	L(StrncatExit5)
> -	test	$0x20, %al
> -	jnz	L(Exit6)
> -	cmp	$6, %r8
> -	je	L(StrncatExit6)
> -	test	$0x40, %al
> -	jnz	L(Exit7)
> -	cmp	$7, %r8
> -	je	L(StrncatExit7)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	lea	7(%rdx), %rax
> -	cmpb	$1, (%rax)
> -	sbb	$-1, %rax
> -	xor	%cl, %cl
> -	movb	%cl, (%rax)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(ExitHighCase2):
> -	test	$0x01, %ah
> -	jnz	L(Exit9)
> -	cmp	$9, %r8
> -	je	L(StrncatExit9)
> -	test	$0x02, %ah
> -	jnz	L(Exit10)
> -	cmp	$10, %r8
> -	je	L(StrncatExit10)
> -	test	$0x04, %ah
> -	jnz	L(Exit11)
> -	cmp	$11, %r8
> -	je	L(StrncatExit11)
> -	test	$0x8, %ah
> -	jnz	L(Exit12)
> -	cmp	$12, %r8
> -	je	L(StrncatExit12)
> -	test	$0x10, %ah
> -	jnz	L(Exit13)
> -	cmp	$13, %r8
> -	je	L(StrncatExit13)
> -	test	$0x20, %ah
> -	jnz	L(Exit14)
> -	cmp	$14, %r8
> -	je	L(StrncatExit14)
> -	test	$0x40, %ah
> -	jnz	L(Exit15)
> -	cmp	$15, %r8
> -	je	L(StrncatExit15)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	8(%rcx), %xmm1
> -	movlpd	%xmm1, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> -	test	%rax, %rax
> -	jnz	L(CopyFrom1To16BytesCase2)
> -
> -	.p2align 4
> -L(CopyFrom1To16BytesCase3):
> -	add	$16, %r8
> -	add	%rsi, %rdx
> -	add	%rsi, %rcx
> -
> -	cmp	$8, %r8
> -	ja	L(ExitHighCase3)
> -	cmp	$1, %r8
> -	je	L(StrncatExit1)
> -	cmp	$2, %r8
> -	je	L(StrncatExit2)
> -	cmp	$3, %r8
> -	je	L(StrncatExit3)
> -	cmp	$4, %r8
> -	je	L(StrncatExit4)
> -	cmp	$5, %r8
> -	je	L(StrncatExit5)
> -	cmp	$6, %r8
> -	je	L(StrncatExit6)
> -	cmp	$7, %r8
> -	je	L(StrncatExit7)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	xor	%ah, %ah
> -	movb	%ah, 8(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(ExitHighCase3):
> -	cmp	$9, %r8
> -	je	L(StrncatExit9)
> -	cmp	$10, %r8
> -	je	L(StrncatExit10)
> -	cmp	$11, %r8
> -	je	L(StrncatExit11)
> -	cmp	$12, %r8
> -	je	L(StrncatExit12)
> -	cmp	$13, %r8
> -	je	L(StrncatExit13)
> -	cmp	$14, %r8
> -	je	L(StrncatExit14)
> -	cmp	$15, %r8
> -	je	L(StrncatExit15)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	8(%rcx), %xmm1
> -	movlpd	%xmm1, 8(%rdx)
> -	xor	%ah, %ah
> -	movb	%ah, 16(%rdx)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit0):
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit15Bytes):
> -	cmp	$9, %r8
> -	je	L(StrncatExit9)
> -	cmpb	$0, 9(%rcx)
> -	jz	L(Exit10)
> -	cmp	$10, %r8
> -	je	L(StrncatExit10)
> -	cmpb	$0, 10(%rcx)
> -	jz	L(Exit11)
> -	cmp	$11, %r8
> -	je	L(StrncatExit11)
> -	cmpb	$0, 11(%rcx)
> -	jz	L(Exit12)
> -	cmp	$12, %r8
> -	je	L(StrncatExit12)
> -	cmpb	$0, 12(%rcx)
> -	jz	L(Exit13)
> -	cmp	$13, %r8
> -	je	L(StrncatExit13)
> -	cmpb	$0, 13(%rcx)
> -	jz	L(Exit14)
> -	cmp	$14, %r8
> -	je	L(StrncatExit14)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	movlpd	7(%rcx), %xmm1
> -	movlpd	%xmm1, 7(%rdx)
> -	lea	14(%rdx), %rax
> -	cmpb	$1, (%rax)
> -	sbb	$-1, %rax
> -	xor	%cl, %cl
> -	movb	%cl, (%rax)
> -	mov	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(StrncatExit8Bytes):
> -	cmpb	$0, (%rcx)
> -	jz	L(Exit1)
> -	cmp	$1, %r8
> -	je	L(StrncatExit1)
> -	cmpb	$0, 1(%rcx)
> -	jz	L(Exit2)
> -	cmp	$2, %r8
> -	je	L(StrncatExit2)
> -	cmpb	$0, 2(%rcx)
> -	jz	L(Exit3)
> -	cmp	$3, %r8
> -	je	L(StrncatExit3)
> -	cmpb	$0, 3(%rcx)
> -	jz	L(Exit4)
> -	cmp	$4, %r8
> -	je	L(StrncatExit4)
> -	cmpb	$0, 4(%rcx)
> -	jz	L(Exit5)
> -	cmp	$5, %r8
> -	je	L(StrncatExit5)
> -	cmpb	$0, 5(%rcx)
> -	jz	L(Exit6)
> -	cmp	$6, %r8
> -	je	L(StrncatExit6)
> -	cmpb	$0, 6(%rcx)
> -	jz	L(Exit7)
> -	cmp	$7, %r8
> -	je	L(StrncatExit7)
> -	movlpd	(%rcx), %xmm0
> -	movlpd	%xmm0, (%rdx)
> -	lea	7(%rdx), %rax
> -	cmpb	$1, (%rax)
> -	sbb	$-1, %rax
> -	xor	%cl, %cl
> -	movb	%cl, (%rax)
> -	mov	%rdi, %rax
> -	ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> +/* Implemented in strcpy-ssse3-v2.S  */

Same question here... why keep this around if the -v2
file is going to have the new implementation?

I apologize if we already covered this, but I can't
remember.

> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> index 9725857..77b9adb 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> @@ -21,18 +21,218 @@
>  # include <sysdep.h>
>  
>  # ifndef STRCPY
> -#  define STRCPY_TAIL	__strcpy_sse2_unaligned_tail
>  #  define STRCPY	__strcpy_sse2_unaligned
> +#  define STRCAT	__strcat_sse2_unaligned
>  # endif
>  
>  # define ALIGN(x) .p2align x
>  
>  #ifndef USE_AS_STPCPY
> -ENTRY (STRCPY_TAIL)
> -	movq  %rsi, %rdx
> -	pxor  %xmm4, %xmm4
> -	jmp L(from_tail)
> -END (STRCPY_TAIL)
> +     /* Optimized strcat, we we optimize path when src and dest are at
> +	most 64 bytes large by finding terminating zero in parallel. 
> +
> +	  if ((src | dest) % 4096 > 4096 - 64) 
> +	    goto strcat_cross_page;
> +	  d = zero64 (dest)
> +          s = zero64 (src)
> +	  if (d) 
> +            {
> +	      found:;
> +	      dest += ffs (d)
> +	      if (s)
> +		return copy_less64_bytes (dest, src, ffs (s))
> +	      copy64_bytes (dest, src)
> +	      return strcpy (dest, src)
> +	    } 
> +          else 
> +            {
> +	      dest = ALIGN_DOWN (dest,64);
> +	      while (1) 
> +		{
> +	          dest += 64;
> +	          d = zero64 (dest);
> +	          if (d)
> +		    goto found;
> +	        }
> +            }
> +	 */

Move end of comment up a line please.

> +ENTRY (STRCAT)
> +	movq	%rdi, %rax
> +	pxor	%xmm5, %xmm5
> +	movq	%rsi, %rdx
> +        pxor	%xmm6, %xmm6
> +	orq	%rdi, %rdx
> +        pxor	%xmm7, %xmm7
> +	andl	$4095, %edx
> +	pxor	%xmm0, %xmm0
> +        cmpq	$4032, %rdx
> +	ja	L(strcat_cross_page)
> +	movdqu	(%rsi), %xmm1
> +	movdqu	16(%rsi), %xmm2
> +	movdqu	32(%rsi), %xmm3
> +	movdqu	48(%rsi), %xmm4
> +        pxor	%xmm8, %xmm8
> +	movdqu	(%rdi), %xmm9
> +	movdqu	16(%rdi), %xmm10
> +	movdqu	32(%rdi), %xmm11
> +	movdqu	48(%rdi), %xmm12
> +	pcmpeqb	%xmm1, %xmm5
> +	pcmpeqb	%xmm2, %xmm6
> +	pcmpeqb %xmm3, %xmm7
> +	pcmpeqb %xmm4, %xmm0
> +	pcmpeqb %xmm8, %xmm9
> +	pcmpeqb %xmm8, %xmm10
> +	pcmpeqb %xmm8, %xmm11
> +	pcmpeqb %xmm8, %xmm12
> +
> +	pmovmskb %xmm5, %edx
> +	pmovmskb %xmm6, %r8d
> +	pmovmskb %xmm7, %r9d
> +	pmovmskb %xmm0, %r10d
> +	shlq	$16,	%r8
> +	shlq	$32,	%r9
> +	shlq	$48,	%r10
> +	orq	%r8,	%rdx
> +	orq	%r9,	%rdx
> +	orq	%r10,	%rdx
> +
> +	pmovmskb %xmm9, %ecx
> + 	pmovmskb %xmm10, %r11d
> +        pmovmskb %xmm11, %r10d
> +        pmovmskb %xmm12, %r9d
> +	shlq	$16,	%r11
> +	shlq	$32,	%r10
> +	shlq	$48,	%r9
> +	orq	%r11,	%rcx
> +	orq	%r10,	%rcx
> +	orq	%r9,	%rcx
> +
> +	testq	%rcx, %rcx
> +	je	L(strcat_loop_start)
> +L(strcat_found_zero):
> +	bsfq	%rcx, %rcx
> +	addq	%rcx, %rdi
> +	testq	%rdx, %rdx
> +	jne	L(last_64_bytes)
> +	jmp	L(strcat_first_64_bytes)
> +
> +L(strcat_loop_start):
> +	andq	$-64, %rdi
> +	ALIGN(4)	
> +L(strcat_loop):
> +	movdqa	64(%rdi), %xmm5
> +	pminub	80(%rdi), %xmm5
> +	pminub	96(%rdi), %xmm5
> +	pminub	112(%rdi), %xmm5
> +	addq	$64, %rdi
> +	pcmpeqb  %xmm8, %xmm5
> +	pmovmskb %xmm5, %ecx
> +	testl	%ecx, %ecx
> +	je	L(strcat_loop)
> +	shlq	$48, %rcx
> +	movdqa	(%rdi), %xmm9
> +	movdqa	16(%rdi), %xmm10
> +	movdqa	32(%rdi), %xmm11
> +	pcmpeqb	%xmm8, %xmm9
> +	pcmpeqb	%xmm8, %xmm10
> +	pcmpeqb	%xmm8, %xmm11
> +	pmovmskb	%xmm9, %r9d
> +	pmovmskb	%xmm10, %r10d
> +	pmovmskb	%xmm11, %r11d
> +	salq	$16, %r10
> +	salq	$32, %r11
> +	orq	%r9, %rcx
> +	orq	%r10, %rcx
> +	orq	%r11, %rcx
> +	jmp	L(strcat_found_zero)	
> +
> +	/* 

Move start of comment up a line please.

> +	  d_al = ALIGN_DOWN (dest, 64);
> +	  d = zero64 (d_al)
> +	  d = d >> (dest - d_al)
> +	  if (d) 
> +            {
> +	      dest += ffs (d)
> +	      return strcpy (dest, src)
> +	    } 
> +          else 
> +            {
> +	      dest = ALIGN_DOWN (dest,64);
> +	      while (1) 
> +                {
> +	          dest += 64;
> +	          d = zero64 (dest);
> +	          if (d) {
> +                    dest += ffs (d)
> +	            return strcpy (dest, src)
> +                  }
> +	        }
> +            } */
> +	L(strcat_cross_page):
> +	andq	$-64, %rdi
> +
> +	movdqa	48(%rdi), %xmm12
> +	pcmpeqb	%xmm8, %xmm12
> +	pmovmskb %xmm12, %rcx
> +	shlq	$48, %rcx
> +	movdqa	(%rdi), %xmm9
> +	movdqa	16(%rdi), %xmm10
> +	movdqa	32(%rdi), %xmm11
> +	pcmpeqb	%xmm8, %xmm9
> +	pcmpeqb	%xmm8, %xmm10
> +	pcmpeqb	%xmm8, %xmm11
> +	pmovmskb	%xmm9, %r9d
> +	pmovmskb	%xmm10, %r10d
> +	pmovmskb	%xmm11, %r11d
> +	salq	$16, %r10
> +	salq	$32, %r11
> +	orq	%r9, %rcx
> +	orq	%r10, %rcx
> +	orq	%r11, %rcx
> +	movq	%rcx, %rdx
> +	movq	%rax, %rcx
> +	shrq	%cl, %rdx /* We use fact that shifts are done modulo 64.  */

s/use/use the/g
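
For reference, the fact being alluded to: x86-64 variable shifts use
only the low 6 bits of the count register, so the shrq above with the
raw destination address in %cl shifts by dest modulo 64, i.e. by the
offset of dest within the aligned 64-byte block that was just scanned.
A minimal C sketch of what that shift accomplishes (hypothetical
helper, not from the patch):

    #include <stdint.h>

    /* Drop mask bits for bytes that precede DEST in its aligned
       64-byte block; the hardware shift performs the "& 63" itself.  */
    static inline uint64_t
    discard_before_dest (uint64_t nul_mask, const char *dest)
    {
      return nul_mask >> ((uintptr_t) dest & 63);
    }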

> +	testq	%rdx, %rdx
> +	je	L(strcat_cross_loop)
> +	movq	%rax, %rdi
> +
> +	pxor	%xmm4, %xmm4
> +	bsfq	%rdx, %rcx
> +	addq	%rcx, %rdi
> +	jmp	L(from_strcat)
> +
> +	ALIGN(4)	
> +L(strcat_cross_loop):
> +	movdqa	64(%rdi), %xmm5
> +	pminub	80(%rdi), %xmm5
> +	pminub	96(%rdi), %xmm5
> +	pminub	112(%rdi), %xmm5
> +	addq	$64, %rdi
> +	pcmpeqb	%xmm8, %xmm5
> +	pmovmskb	%xmm5, %ecx
> +	testl	%ecx, %ecx
> +	je	L(strcat_cross_loop)
> +	shlq	$48, %rcx
> +	movdqa	(%rdi), %xmm9
> +	movdqa	16(%rdi), %xmm10
> +	movdqa	32(%rdi), %xmm11
> +	pcmpeqb	%xmm8, %xmm9
> +	pcmpeqb	%xmm8, %xmm10
> +	pcmpeqb	%xmm8, %xmm11
> +	pmovmskb	%xmm9, %r9d
> +	pmovmskb	%xmm10, %r10d
> +	pmovmskb	%xmm11, %r11d
> +	salq	$16, %r10
> +	salq	$32, %r11
> +	orq	%r9, %rcx
> +	orq	%r10, %rcx
> +	orq	%r11, %rcx
> +
> +	pxor	%xmm4, %xmm4
> +	bsfq	%rcx, %rcx
> +	addq	%rcx, %rdi
> +	jmp	L(from_strcat)
> +END (STRCAT)
>  #endif
>  
>  ENTRY (STRCPY)
> @@ -51,7 +251,7 @@ ENTRY (STRCPY)
>  	movq	%rsi, %rdx
>  	pxor	%xmm4, %xmm4
>  	movq	%rdi, %rax
> -L(from_tail):
> +L(from_strcat):
>  	pxor	%xmm5, %xmm5
>  	andl	$4095, %edx
>  	pxor	%xmm6, %xmm6
> @@ -88,6 +288,7 @@ L(from_tail):
>  	salq	$48, %rcx
>  	orq	%rcx, %rdx
>  	jne	L(between_32_64_bytes)
> +L(strcat_first_64_bytes):
>  	movdqu	%xmm1, (%rdi)
>  	movdqu	%xmm2, 16(%rdi)
>  	movdqu	%xmm3, 32(%rdi)
> @@ -137,7 +338,7 @@ L(prepare_loop):
>       /* After loop finished we call following
>  	copy_less64_bytes (erdi, ersi, ffs(erdx) + 1);
>          return; */
> -
> +L(last_64_bytes):
>  	bsfq	%rdx, %rcx
>  #ifdef USE_AS_STPCPY
>  	lea	(%rdi, %rcx), %rax
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> index 8f70c42..9705e53 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> @@ -19,8 +19,8 @@
>  #define USE_SSSE3
>  
>  #ifndef STRCPY
> -# define STRCPY_TAIL	__strcpy_ssse3_tail
>  # define STRCPY	__strcpy_ssse3
> +# define STRAT	__strcat_ssse3

Typo? Needs testing?
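
Presumably STRCAT was intended rather than STRAT: since STRCPY is
already defined here, the #ifndef STRCPY block in
strcpy-sse2-unaligned-v2.S is skipped, so STRCAT would remain undefined
and the ENTRY (STRCAT) in the included file would not get the SSSE3
name.  A guess at the intended line (needs the author's confirmation):

    # define STRCAT	__strcat_ssse3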

>  #endif
>  
>  #include "strcpy-sse2-unaligned-v2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> index 133e1d2..34dd69b 100644
> --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> @@ -1,3 +1,286 @@
> +/* strncat with SSE2
> +   Copyright (C) 2011-2013 Free Software Foundation, Inc.

How did you come up with the copyright years?

Was it from the original file you inlined?

> +   Contributed by Intel Corporation.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
>  #define USE_AS_STRNCAT
>  #define STRCAT __strncat_sse2_unaligned
> -#include "strcat-sse2-unaligned.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +#  define STRCAT  __strcat_sse2_unaligned
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> +	mov	%rdi, %r9
> +# ifdef USE_AS_STRNCAT
> +	mov	%rdx, %r8
> +# endif
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> +   implementation gets merged.  */
> +
> +	xor	%rax, %rax
> +	mov	%edi, %ecx
> +	and	$0x3f, %ecx
> +	pxor	%xmm0, %xmm0
> +	cmp	$0x30, %ecx
> +	ja	L(next)
> +	movdqu	(%rdi), %xmm1
> +	pcmpeqb	%xmm1, %xmm0
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit_less16)
> +	mov	%rdi, %rax
> +	and	$-16, %rax
> +	jmp	L(align16_start)
> +L(next):
> +	mov	%rdi, %rax
> +	and	$-16, %rax
> +	pcmpeqb	(%rax), %xmm0
> +	mov	$-1, %r10d
> +	sub	%rax, %rcx
> +	shl	%cl, %r10d
> +	pmovmskb %xmm0, %edx
> +	and	%r10d, %edx
> +	jnz	L(exit)
> +
> +L(align16_start):
> +	pxor	%xmm0, %xmm0
> +	pxor	%xmm1, %xmm1
> +	pxor	%xmm2, %xmm2
> +	pxor	%xmm3, %xmm3
> +	pcmpeqb	16(%rax), %xmm0
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit16)
> +
> +	pcmpeqb	32(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit32)
> +
> +	pcmpeqb	48(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit48)
> +
> +	pcmpeqb	64(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	jnz	L(exit64)
> +
> +	pcmpeqb	80(%rax), %xmm0
> +	add	$64, %rax
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit16)
> +
> +	pcmpeqb	32(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit32)
> +
> +	pcmpeqb	48(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit48)
> +
> +	pcmpeqb	64(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	jnz	L(exit64)
> +
> +	pcmpeqb	80(%rax), %xmm0
> +	add	$64, %rax
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit16)
> +
> +	pcmpeqb	32(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit32)
> +
> +	pcmpeqb	48(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit48)
> +
> +	pcmpeqb	64(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	jnz	L(exit64)
> +
> +	pcmpeqb	80(%rax), %xmm0
> +	add	$64, %rax
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit16)
> +
> +	pcmpeqb	32(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit32)
> +
> +	pcmpeqb	48(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit48)
> +
> +	pcmpeqb	64(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	jnz	L(exit64)
> +
> +	test	$0x3f, %rax
> +	jz	L(align64_loop)
> +
> +	pcmpeqb	80(%rax), %xmm0
> +	add	$80, %rax
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit)
> +
> +	test	$0x3f, %rax
> +	jz	L(align64_loop)
> +
> +	pcmpeqb	16(%rax), %xmm1
> +	add	$16, %rax
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit)
> +
> +	test	$0x3f, %rax
> +	jz	L(align64_loop)
> +
> +	pcmpeqb	16(%rax), %xmm2
> +	add	$16, %rax
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit)
> +
> +	test	$0x3f, %rax
> +	jz	L(align64_loop)
> +
> +	pcmpeqb	16(%rax), %xmm3
> +	add	$16, %rax
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	jnz	L(exit)
> +
> +	add	$16, %rax
> +	.p2align 4
> +	L(align64_loop):
> +	movaps	(%rax),	%xmm4
> +	pminub	16(%rax),	%xmm4
> +	movaps	32(%rax),	%xmm5
> +	pminub	48(%rax),	%xmm5
> +	add	$64,	%rax
> +	pminub	%xmm4,	%xmm5
> +	pcmpeqb	%xmm0,	%xmm5
> +	pmovmskb %xmm5,	%edx
> +	test	%edx,	%edx
> +	jz	L(align64_loop)
> +
> +	pcmpeqb	-64(%rax), %xmm0
> +	sub	$80,	%rax
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	jnz	L(exit16)
> +
> +	pcmpeqb	32(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	jnz	L(exit32)
> +
> +	pcmpeqb	48(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	jnz	L(exit48)
> +
> +	pcmpeqb	64(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	sub	%rdi, %rax
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	add	$64, %rax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit):
> +	sub	%rdi, %rax
> +L(exit_less16):
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit16):
> +	sub	%rdi, %rax
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	add	$16, %rax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit32):
> +	sub	%rdi, %rax
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	add	$32, %rax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit48):
> +	sub	%rdi, %rax
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	add	$48, %rax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit64):
> +	sub	%rdi, %rax
> +	bsf	%rdx, %rdx
> +	add	%rdx, %rax
> +	add	$64, %rax
> +
> +	.p2align 4
> +L(StartStrcpyPart):
> +	lea	(%r9, %rax), %rdi
> +	mov	%rsi, %rcx
> +	mov	%r9, %rax      /* save result */
> +
> +# ifdef USE_AS_STRNCAT
> +	test	%r8, %r8
> +	jz	L(ExitZero)
> +#  define USE_AS_STRNCPY
> +#  include "strcpy-sse2-unaligned.S"
> +
> +# else
> +	jmp __strcpy_sse2_unaligned_tail
> +  END (STRCAT)
> +# endif
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> index 6c45ff3..a76075c 100644
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> @@ -1,3 +1,870 @@
> +/* strncat with SSSE3
> +   Copyright (C) 2011-2013 Free Software Foundation, Inc.

How did you determine the copyright years?

Was it from the original file you inlined?

> +   Contributed by Intel Corporation.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
>  #define USE_AS_STRNCAT
>  #define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +#  define STRCAT  __strcat_ssse3
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> +# ifdef USE_AS_STRNCAT
> +	mov	%rdx, %r8
> +# endif
> +
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> +   implementation gets merged.  */
> +
> +	xor	%eax, %eax
> +	cmpb	$0, (%rdi)
> +	jz	L(exit_tail0)
> +	cmpb	$0, 1(%rdi)
> +	jz	L(exit_tail1)
> +	cmpb	$0, 2(%rdi)
> +	jz	L(exit_tail2)
> +	cmpb	$0, 3(%rdi)
> +	jz	L(exit_tail3)
> +
> +	cmpb	$0, 4(%rdi)
> +	jz	L(exit_tail4)
> +	cmpb	$0, 5(%rdi)
> +	jz	L(exit_tail5)
> +	cmpb	$0, 6(%rdi)
> +	jz	L(exit_tail6)
> +	cmpb	$0, 7(%rdi)
> +	jz	L(exit_tail7)
> +
> +	cmpb	$0, 8(%rdi)
> +	jz	L(exit_tail8)
> +	cmpb	$0, 9(%rdi)
> +	jz	L(exit_tail9)
> +	cmpb	$0, 10(%rdi)
> +	jz	L(exit_tail10)
> +	cmpb	$0, 11(%rdi)
> +	jz	L(exit_tail11)
> +
> +	cmpb	$0, 12(%rdi)
> +	jz	L(exit_tail12)
> +	cmpb	$0, 13(%rdi)
> +	jz	L(exit_tail13)
> +	cmpb	$0, 14(%rdi)
> +	jz	L(exit_tail14)
> +	cmpb	$0, 15(%rdi)
> +	jz	L(exit_tail15)
> +	pxor	%xmm0, %xmm0
> +	lea	16(%rdi), %rcx
> +	lea	16(%rdi), %rax
> +	and	$-16, %rax
> +
> +	pcmpeqb	(%rax), %xmm0
> +	pmovmskb %xmm0, %edx
> +	pxor	%xmm1, %xmm1
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	pxor	%xmm2, %xmm2
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	pxor	%xmm3, %xmm3
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm0
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm0
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm0
> +	pmovmskb %xmm0, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm1
> +	pmovmskb %xmm1, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm2
> +	pmovmskb %xmm2, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	pcmpeqb	(%rax), %xmm3
> +	pmovmskb %xmm3, %edx
> +	test	%edx, %edx
> +	lea	16(%rax), %rax
> +	jnz	L(exit)
> +
> +	and	$-0x40, %rax
> +
> +	.p2align 4
> +L(aligned_64):
> +	pcmpeqb	(%rax), %xmm0
> +	pcmpeqb	16(%rax), %xmm1
> +	pcmpeqb	32(%rax), %xmm2
> +	pcmpeqb	48(%rax), %xmm3
> +	pmovmskb %xmm0, %edx
> +	pmovmskb %xmm1, %r11d
> +	pmovmskb %xmm2, %r10d
> +	pmovmskb %xmm3, %r9d
> +	or	%edx, %r9d
> +	or	%r11d, %r9d
> +	or	%r10d, %r9d
> +	lea	64(%rax), %rax
> +	jz	L(aligned_64)
> +
> +	test	%edx, %edx
> +	jnz	L(aligned_64_exit_16)
> +	test	%r11d, %r11d
> +	jnz	L(aligned_64_exit_32)
> +	test	%r10d, %r10d
> +	jnz	L(aligned_64_exit_48)
> +
> +L(aligned_64_exit_64):
> +	pmovmskb %xmm3, %edx
> +	jmp	L(exit)
> +
> +L(aligned_64_exit_48):
> +	lea	-16(%rax), %rax
> +	mov	%r10d, %edx
> +	jmp	L(exit)
> +
> +L(aligned_64_exit_32):
> +	lea	-32(%rax), %rax
> +	mov	%r11d, %edx
> +	jmp	L(exit)
> +
> +L(aligned_64_exit_16):
> +	lea	-48(%rax), %rax
> +
> +L(exit):
> +	sub	%rcx, %rax
> +	test	%dl, %dl
> +	jz	L(exit_high)
> +	test	$0x01, %dl
> +	jnz	L(exit_tail0)
> +
> +	test	$0x02, %dl
> +	jnz	L(exit_tail1)
> +
> +	test	$0x04, %dl
> +	jnz	L(exit_tail2)
> +
> +	test	$0x08, %dl
> +	jnz	L(exit_tail3)
> +
> +	test	$0x10, %dl
> +	jnz	L(exit_tail4)
> +
> +	test	$0x20, %dl
> +	jnz	L(exit_tail5)
> +
> +	test	$0x40, %dl
> +	jnz	L(exit_tail6)
> +	add	$7, %eax
> +L(exit_tail0):
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_high):
> +	add	$8, %eax
> +	test	$0x01, %dh
> +	jnz	L(exit_tail0)
> +
> +	test	$0x02, %dh
> +	jnz	L(exit_tail1)
> +
> +	test	$0x04, %dh
> +	jnz	L(exit_tail2)
> +
> +	test	$0x08, %dh
> +	jnz	L(exit_tail3)
> +
> +	test	$0x10, %dh
> +	jnz	L(exit_tail4)
> +
> +	test	$0x20, %dh
> +	jnz	L(exit_tail5)
> +
> +	test	$0x40, %dh
> +	jnz	L(exit_tail6)
> +	add	$7, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail1):
> +	add	$1, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail2):
> +	add	$2, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail3):
> +	add	$3, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail4):
> +	add	$4, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail5):
> +	add	$5, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail6):
> +	add	$6, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail7):
> +	add	$7, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail8):
> +	add	$8, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail9):
> +	add	$9, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail10):
> +	add	$10, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail11):
> +	add	$11, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail12):
> +	add	$12, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail13):
> +	add	$13, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail14):
> +	add	$14, %eax
> +	jmp	L(StartStrcpyPart)
> +
> +	.p2align 4
> +L(exit_tail15):
> +	add	$15, %eax
> +
> +	.p2align 4
> +L(StartStrcpyPart):
> +	mov	%rsi, %rcx
> +	lea	(%rdi, %rax), %rdx
> +# ifdef USE_AS_STRNCAT
> +	test	%r8, %r8
> +	jz	L(StrncatExit0)
> +	cmp	$8, %r8
> +	jbe	L(StrncatExit8Bytes)
> +# endif
> +	cmpb	$0, (%rcx)
> +	jz	L(Exit1)
> +	cmpb	$0, 1(%rcx)
> +	jz	L(Exit2)
> +	cmpb	$0, 2(%rcx)
> +	jz	L(Exit3)
> +	cmpb	$0, 3(%rcx)
> +	jz	L(Exit4)
> +	cmpb	$0, 4(%rcx)
> +	jz	L(Exit5)
> +	cmpb	$0, 5(%rcx)
> +	jz	L(Exit6)
> +	cmpb	$0, 6(%rcx)
> +	jz	L(Exit7)
> +	cmpb	$0, 7(%rcx)
> +	jz	L(Exit8)
> +	cmpb	$0, 8(%rcx)
> +	jz	L(Exit9)
> +# ifdef USE_AS_STRNCAT
> +	cmp	$16, %r8
> +	jb	L(StrncatExit15Bytes)
> +# endif
> +	cmpb	$0, 9(%rcx)
> +	jz	L(Exit10)
> +	cmpb	$0, 10(%rcx)
> +	jz	L(Exit11)
> +	cmpb	$0, 11(%rcx)
> +	jz	L(Exit12)
> +	cmpb	$0, 12(%rcx)
> +	jz	L(Exit13)
> +	cmpb	$0, 13(%rcx)
> +	jz	L(Exit14)
> +	cmpb	$0, 14(%rcx)
> +	jz	L(Exit15)
> +	cmpb	$0, 15(%rcx)
> +	jz	L(Exit16)
> +# ifdef USE_AS_STRNCAT
> +	cmp	$16, %r8
> +	je	L(StrncatExit16)
> +#  define USE_AS_STRNCPY
> +# endif
> +
> +# include "strcpy-ssse3.S"
> +
> +	.p2align 4
> +L(CopyFrom1To16Bytes):
> +	add	%rsi, %rdx
> +	add	%rsi, %rcx
> +
> +	test	%al, %al
> +	jz	L(ExitHigh)
> +	test	$0x01, %al
> +	jnz	L(Exit1)
> +	test	$0x02, %al
> +	jnz	L(Exit2)
> +	test	$0x04, %al
> +	jnz	L(Exit3)
> +	test	$0x08, %al
> +	jnz	L(Exit4)
> +	test	$0x10, %al
> +	jnz	L(Exit5)
> +	test	$0x20, %al
> +	jnz	L(Exit6)
> +	test	$0x40, %al
> +	jnz	L(Exit7)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(ExitHigh):
> +	test	$0x01, %ah
> +	jnz	L(Exit9)
> +	test	$0x02, %ah
> +	jnz	L(Exit10)
> +	test	$0x04, %ah
> +	jnz	L(Exit11)
> +	test	$0x08, %ah
> +	jnz	L(Exit12)
> +	test	$0x10, %ah
> +	jnz	L(Exit13)
> +	test	$0x20, %ah
> +	jnz	L(Exit14)
> +	test	$0x40, %ah
> +	jnz	L(Exit15)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	8(%rcx), %xmm1
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	%xmm1, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit1):
> +	xor	%ah, %ah
> +	movb	%ah, 1(%rdx)
> +L(Exit1):
> +	movb	(%rcx), %al
> +	movb	%al, (%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit2):
> +	xor	%ah, %ah
> +	movb	%ah, 2(%rdx)
> +L(Exit2):
> +	movw	(%rcx), %ax
> +	movw	%ax, (%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit3):
> +	xor	%ah, %ah
> +	movb	%ah, 3(%rdx)
> +L(Exit3):
> +	movw	(%rcx), %ax
> +	movw	%ax, (%rdx)
> +	movb	2(%rcx), %al
> +	movb	%al, 2(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit4):
> +	xor	%ah, %ah
> +	movb	%ah, 4(%rdx)
> +L(Exit4):
> +	mov	(%rcx), %eax
> +	mov	%eax, (%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit5):
> +	xor	%ah, %ah
> +	movb	%ah, 5(%rdx)
> +L(Exit5):
> +	mov	(%rcx), %eax
> +	mov	%eax, (%rdx)
> +	movb	4(%rcx), %al
> +	movb	%al, 4(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit6):
> +	xor	%ah, %ah
> +	movb	%ah, 6(%rdx)
> +L(Exit6):
> +	mov	(%rcx), %eax
> +	mov	%eax, (%rdx)
> +	movw	4(%rcx), %ax
> +	movw	%ax, 4(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit7):
> +	xor	%ah, %ah
> +	movb	%ah, 7(%rdx)
> +L(Exit7):
> +	mov	(%rcx), %eax
> +	mov	%eax, (%rdx)
> +	mov	3(%rcx), %eax
> +	mov	%eax, 3(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit8):
> +	xor	%ah, %ah
> +	movb	%ah, 8(%rdx)
> +L(Exit8):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit9):
> +	xor	%ah, %ah
> +	movb	%ah, 9(%rdx)
> +L(Exit9):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movb	8(%rcx), %al
> +	movb	%al, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit10):
> +	xor	%ah, %ah
> +	movb	%ah, 10(%rdx)
> +L(Exit10):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movw	8(%rcx), %ax
> +	movw	%ax, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit11):
> +	xor	%ah, %ah
> +	movb	%ah, 11(%rdx)
> +L(Exit11):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	mov	7(%rcx), %eax
> +	mov	%eax, 7(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit12):
> +	xor	%ah, %ah
> +	movb	%ah, 12(%rdx)
> +L(Exit12):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	mov	8(%rcx), %eax
> +	mov	%eax, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit13):
> +	xor	%ah, %ah
> +	movb	%ah, 13(%rdx)
> +L(Exit13):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	5(%rcx), %xmm1
> +	movlpd	%xmm1, 5(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit14):
> +	xor	%ah, %ah
> +	movb	%ah, 14(%rdx)
> +L(Exit14):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	6(%rcx), %xmm1
> +	movlpd	%xmm1, 6(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit15):
> +	xor	%ah, %ah
> +	movb	%ah, 15(%rdx)
> +L(Exit15):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	7(%rcx), %xmm1
> +	movlpd	%xmm1, 7(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit16):
> +	xor	%ah, %ah
> +	movb	%ah, 16(%rdx)
> +L(Exit16):
> +	movlpd	(%rcx), %xmm0
> +	movlpd	8(%rcx), %xmm1
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	%xmm1, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +# ifdef USE_AS_STRNCPY
> +
> +	.p2align 4
> +L(CopyFrom1To16BytesCase2):
> +	add	$16, %r8
> +	add	%rsi, %rcx
> +	lea	(%rsi, %rdx), %rsi
> +	lea	-9(%r8), %rdx
> +	and	$1<<7, %dh
> +	or	%al, %dh
> +	test	%dh, %dh
> +	lea	(%rsi), %rdx
> +	jz	L(ExitHighCase2)
> +
> +	test	$0x01, %al
> +	jnz	L(Exit1)
> +	cmp	$1, %r8
> +	je	L(StrncatExit1)
> +	test	$0x02, %al
> +	jnz	L(Exit2)
> +	cmp	$2, %r8
> +	je	L(StrncatExit2)
> +	test	$0x04, %al
> +	jnz	L(Exit3)
> +	cmp	$3, %r8
> +	je	L(StrncatExit3)
> +	test	$0x08, %al
> +	jnz	L(Exit4)
> +	cmp	$4, %r8
> +	je	L(StrncatExit4)
> +	test	$0x10, %al
> +	jnz	L(Exit5)
> +	cmp	$5, %r8
> +	je	L(StrncatExit5)
> +	test	$0x20, %al
> +	jnz	L(Exit6)
> +	cmp	$6, %r8
> +	je	L(StrncatExit6)
> +	test	$0x40, %al
> +	jnz	L(Exit7)
> +	cmp	$7, %r8
> +	je	L(StrncatExit7)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	lea	7(%rdx), %rax
> +	cmpb	$1, (%rax)
> +	sbb	$-1, %rax
> +	xor	%cl, %cl
> +	movb	%cl, (%rax)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(ExitHighCase2):
> +	test	$0x01, %ah
> +	jnz	L(Exit9)
> +	cmp	$9, %r8
> +	je	L(StrncatExit9)
> +	test	$0x02, %ah
> +	jnz	L(Exit10)
> +	cmp	$10, %r8
> +	je	L(StrncatExit10)
> +	test	$0x04, %ah
> +	jnz	L(Exit11)
> +	cmp	$11, %r8
> +	je	L(StrncatExit11)
> +	test	$0x8, %ah
> +	jnz	L(Exit12)
> +	cmp	$12, %r8
> +	je	L(StrncatExit12)
> +	test	$0x10, %ah
> +	jnz	L(Exit13)
> +	cmp	$13, %r8
> +	je	L(StrncatExit13)
> +	test	$0x20, %ah
> +	jnz	L(Exit14)
> +	cmp	$14, %r8
> +	je	L(StrncatExit14)
> +	test	$0x40, %ah
> +	jnz	L(Exit15)
> +	cmp	$15, %r8
> +	je	L(StrncatExit15)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	8(%rcx), %xmm1
> +	movlpd	%xmm1, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +L(CopyFrom1To16BytesCase2OrCase3):
> +	test	%rax, %rax
> +	jnz	L(CopyFrom1To16BytesCase2)
> +
> +	.p2align 4
> +L(CopyFrom1To16BytesCase3):
> +	add	$16, %r8
> +	add	%rsi, %rdx
> +	add	%rsi, %rcx
> +
> +	cmp	$8, %r8
> +	ja	L(ExitHighCase3)
> +	cmp	$1, %r8
> +	je	L(StrncatExit1)
> +	cmp	$2, %r8
> +	je	L(StrncatExit2)
> +	cmp	$3, %r8
> +	je	L(StrncatExit3)
> +	cmp	$4, %r8
> +	je	L(StrncatExit4)
> +	cmp	$5, %r8
> +	je	L(StrncatExit5)
> +	cmp	$6, %r8
> +	je	L(StrncatExit6)
> +	cmp	$7, %r8
> +	je	L(StrncatExit7)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	xor	%ah, %ah
> +	movb	%ah, 8(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(ExitHighCase3):
> +	cmp	$9, %r8
> +	je	L(StrncatExit9)
> +	cmp	$10, %r8
> +	je	L(StrncatExit10)
> +	cmp	$11, %r8
> +	je	L(StrncatExit11)
> +	cmp	$12, %r8
> +	je	L(StrncatExit12)
> +	cmp	$13, %r8
> +	je	L(StrncatExit13)
> +	cmp	$14, %r8
> +	je	L(StrncatExit14)
> +	cmp	$15, %r8
> +	je	L(StrncatExit15)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	8(%rcx), %xmm1
> +	movlpd	%xmm1, 8(%rdx)
> +	xor	%ah, %ah
> +	movb	%ah, 16(%rdx)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit0):
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit15Bytes):
> +	cmp	$9, %r8
> +	je	L(StrncatExit9)
> +	cmpb	$0, 9(%rcx)
> +	jz	L(Exit10)
> +	cmp	$10, %r8
> +	je	L(StrncatExit10)
> +	cmpb	$0, 10(%rcx)
> +	jz	L(Exit11)
> +	cmp	$11, %r8
> +	je	L(StrncatExit11)
> +	cmpb	$0, 11(%rcx)
> +	jz	L(Exit12)
> +	cmp	$12, %r8
> +	je	L(StrncatExit12)
> +	cmpb	$0, 12(%rcx)
> +	jz	L(Exit13)
> +	cmp	$13, %r8
> +	je	L(StrncatExit13)
> +	cmpb	$0, 13(%rcx)
> +	jz	L(Exit14)
> +	cmp	$14, %r8
> +	je	L(StrncatExit14)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	movlpd	7(%rcx), %xmm1
> +	movlpd	%xmm1, 7(%rdx)
> +	lea	14(%rdx), %rax
> +	cmpb	$1, (%rax)
> +	sbb	$-1, %rax
> +	xor	%cl, %cl
> +	movb	%cl, (%rax)
> +	mov	%rdi, %rax
> +	ret
> +
> +	.p2align 4
> +L(StrncatExit8Bytes):
> +	cmpb	$0, (%rcx)
> +	jz	L(Exit1)
> +	cmp	$1, %r8
> +	je	L(StrncatExit1)
> +	cmpb	$0, 1(%rcx)
> +	jz	L(Exit2)
> +	cmp	$2, %r8
> +	je	L(StrncatExit2)
> +	cmpb	$0, 2(%rcx)
> +	jz	L(Exit3)
> +	cmp	$3, %r8
> +	je	L(StrncatExit3)
> +	cmpb	$0, 3(%rcx)
> +	jz	L(Exit4)
> +	cmp	$4, %r8
> +	je	L(StrncatExit4)
> +	cmpb	$0, 4(%rcx)
> +	jz	L(Exit5)
> +	cmp	$5, %r8
> +	je	L(StrncatExit5)
> +	cmpb	$0, 5(%rcx)
> +	jz	L(Exit6)
> +	cmp	$6, %r8
> +	je	L(StrncatExit6)
> +	cmpb	$0, 6(%rcx)
> +	jz	L(Exit7)
> +	cmp	$7, %r8
> +	je	L(StrncatExit7)
> +	movlpd	(%rcx), %xmm0
> +	movlpd	%xmm0, (%rdx)
> +	lea	7(%rdx), %rax
> +	cmpb	$1, (%rax)
> +	sbb	$-1, %rax
> +	xor	%cl, %cl
> +	movb	%cl, (%rax)
> +	mov	%rdi, %rax
> +	ret
> +
> +# endif
> +END (STRCAT)
> +#endif
> 

Cheers,
Carlos.

