This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/2] Improve strcpy: Rename strcpy-sse2-unaligned.S.
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 9 Sep 2013 19:40:46 +0400
- Subject: Re: [PATCH 1/2] Improve strcpy: Rename strcpy-sse2-unaligned.S.
- Authentication-results: sourceware.org; auth=none
- References: <20130909153051 dot GA23047 at domone dot kolej dot mff dot cuni dot cz>
Why do you need to move? Can't you just add your version as
sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-new.S and build
strcpy-sse2-unaligned-new.S in Makefile instead of
strcpy-sse2-unaligned.S?
--
Liubov
Intel Corporation
On Mon, Sep 9, 2013 at 7:30 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Hi,
>
> This is the first part of improving strcpy with unaligned loads. As I do not
> have code for strncpy/stpncpy/strncat, this patch just moves
> strcpy-sse2-unaligned.S to make the subsequent patch smaller.
>
> * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Move to ...
> * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-old.S: Here.
> * sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: Update
> include location.
> * sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: Likewise.
> * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Likewise.
> * sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: Likewise.
>
> ---
> sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 2 +-
> sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S | 2 +-
> sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 2 +-
> .../x86_64/multiarch/strcpy-sse2-unaligned-old.S | 1887 ++++++++++++++++++++
> sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 1887 --------------------
> sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S | 2 +-
> 6 files changed, 1891 insertions(+), 1891 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-old.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
>
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> index 34231f8..8f863e5 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
> @@ -1,3 +1,3 @@
> #define USE_AS_STPCPY
> #define STRCPY __stpcpy_sse2_unaligned
> -#include "strcpy-sse2-unaligned.S"
> +#include "strcpy-sse2-unaligned-old.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
> index 658520f..90d1533 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
> @@ -1,4 +1,4 @@
> #define USE_AS_STPCPY
> #define USE_AS_STRNCPY
> #define STRCPY __stpncpy_sse2_unaligned
> -#include "strcpy-sse2-unaligned.S"
> +#include "strcpy-sse2-unaligned-old.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..6c38882 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> @@ -275,5 +275,5 @@ L(StartStrcpyPart):
> # define USE_AS_STRNCPY
> # endif
>
> -# include "strcpy-sse2-unaligned.S"
> +# include "strcpy-sse2-unaligned-old.S"
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-old.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-old.S
> new file mode 100644
> index 0000000..7710173
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-old.S
> @@ -0,0 +1,1887 @@
> +/* strcpy with SSE2 and unaligned load
> + Copyright (C) 2011-2013 Free Software Foundation, Inc.
> + Contributed by Intel Corporation.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#ifndef NOT_IN_libc
> +
> +# ifndef USE_AS_STRCAT
> +# include <sysdep.h>
> +
> +# ifndef STRCPY
> +# define STRCPY __strcpy_sse2_unaligned
> +# endif
> +
> +# endif
> +
> +# define JMPTBL(I, B) I - B
> +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> + lea TABLE(%rip), %r11; \
> + movslq (%r11, INDEX, SCALE), %rcx; \
> + lea (%r11, %rcx), %rcx; \
> + jmp *%rcx
> +
> +# ifndef USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCPY)
> +# ifdef USE_AS_STRNCPY
> + mov %rdx, %r8
> + test %r8, %r8
> + jz L(ExitZero)
> +# endif
> + mov %rsi, %rcx
> +# ifndef USE_AS_STPCPY
> + mov %rdi, %rax /* save result */
> +# endif
> +
> +# endif
> +
> + and $63, %rcx
> + cmp $32, %rcx
> + jbe L(SourceStringAlignmentLess32)
> +
> + and $-16, %rsi
> + and $15, %rcx
> + pxor %xmm0, %xmm0
> + pxor %xmm1, %xmm1
> +
> + pcmpeqb (%rsi), %xmm1
> + pmovmskb %xmm1, %rdx
> + shr %cl, %rdx
> +
> +# ifdef USE_AS_STRNCPY
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> + mov $16, %r10
> + sub %rcx, %r10
> + cmp %r10, %r8
> +# else
> + mov $17, %r10
> + sub %rcx, %r10
> + cmp %r10, %r8
> +# endif
> + jbe L(CopyFrom1To16BytesTailCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesTail)
> +
> + pcmpeqb 16(%rsi), %xmm0
> + pmovmskb %xmm0, %rdx
> +
> +# ifdef USE_AS_STRNCPY
> + add $16, %r10
> + cmp %r10, %r8
> + jbe L(CopyFrom1To32BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jnz L(CopyFrom1To32Bytes)
> +
> + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
> + movdqu %xmm1, (%rdi)
> +
> +/* If source address alignment != destination address alignment */
> + .p2align 4
> +L(Unalign16Both):
> + sub %rcx, %rdi
> +# ifdef USE_AS_STRNCPY
> + add %rcx, %r8
> +# endif
> + mov $16, %rcx
> + movdqa (%rsi, %rcx), %xmm1
> + movaps 16(%rsi, %rcx), %xmm2
> + movdqu %xmm1, (%rdi, %rcx)
> + pcmpeqb %xmm2, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $48, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm2)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movaps 16(%rsi, %rcx), %xmm3
> + movdqu %xmm2, (%rdi, %rcx)
> + pcmpeqb %xmm3, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm3)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movaps 16(%rsi, %rcx), %xmm4
> + movdqu %xmm3, (%rdi, %rcx)
> + pcmpeqb %xmm4, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm4)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movaps 16(%rsi, %rcx), %xmm1
> + movdqu %xmm4, (%rdi, %rcx)
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm1)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movaps 16(%rsi, %rcx), %xmm2
> + movdqu %xmm1, (%rdi, %rcx)
> + pcmpeqb %xmm2, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm2)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movaps 16(%rsi, %rcx), %xmm3
> + movdqu %xmm2, (%rdi, %rcx)
> + pcmpeqb %xmm3, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $16, %rcx
> +# ifdef USE_AS_STRNCPY
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm3)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + movdqu %xmm3, (%rdi, %rcx)
> + mov %rsi, %rdx
> + lea 16(%rsi, %rcx), %rsi
> + and $-0x40, %rsi
> + sub %rsi, %rdx
> + sub %rdx, %rdi
> +# ifdef USE_AS_STRNCPY
> + lea 128(%r8, %rdx), %r8
> +# endif
> +L(Unaligned64Loop):
> + movaps (%rsi), %xmm2
> + movaps %xmm2, %xmm4
> + movaps 16(%rsi), %xmm5
> + movaps 32(%rsi), %xmm3
> + movaps %xmm3, %xmm6
> + movaps 48(%rsi), %xmm7
> + pminub %xmm5, %xmm2
> + pminub %xmm7, %xmm3
> + pminub %xmm2, %xmm3
> + pcmpeqb %xmm0, %xmm3
> + pmovmskb %xmm3, %rdx
> +# ifdef USE_AS_STRNCPY
> + sub $64, %r8
> + jbe L(UnalignedLeaveCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jnz L(Unaligned64Leave)
> +
> +L(Unaligned64Loop_start):
> + add $64, %rdi
> + add $64, %rsi
> + movdqu %xmm4, -64(%rdi)
> + movaps (%rsi), %xmm2
> + movdqa %xmm2, %xmm4
> + movdqu %xmm5, -48(%rdi)
> + movaps 16(%rsi), %xmm5
> + pminub %xmm5, %xmm2
> + movaps 32(%rsi), %xmm3
> + movdqu %xmm6, -32(%rdi)
> + movaps %xmm3, %xmm6
> + movdqu %xmm7, -16(%rdi)
> + movaps 48(%rsi), %xmm7
> + pminub %xmm7, %xmm3
> + pminub %xmm2, %xmm3
> + pcmpeqb %xmm0, %xmm3
> + pmovmskb %xmm3, %rdx
> +# ifdef USE_AS_STRNCPY
> + sub $64, %r8
> + jbe L(UnalignedLeaveCase2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jz L(Unaligned64Loop_start)
> +
> +L(Unaligned64Leave):
> + pxor %xmm1, %xmm1
> +
> + pcmpeqb %xmm4, %xmm0
> + pcmpeqb %xmm5, %xmm1
> + pmovmskb %xmm0, %rdx
> + pmovmskb %xmm1, %rcx
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesUnaligned_0)
> + test %rcx, %rcx
> + jnz L(CopyFrom1To16BytesUnaligned_16)
> +
> + pcmpeqb %xmm6, %xmm0
> + pcmpeqb %xmm7, %xmm1
> + pmovmskb %xmm0, %rdx
> + pmovmskb %xmm1, %rcx
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesUnaligned_32)
> +
> + bsf %rcx, %rdx
> + movdqu %xmm4, (%rdi)
> + movdqu %xmm5, 16(%rdi)
> + movdqu %xmm6, 32(%rdi)
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> +# ifdef USE_AS_STPCPY
> + lea 48(%rdi, %rdx), %rax
> +# endif
> + movdqu %xmm7, 48(%rdi)
> + add $15, %r8
> + sub %rdx, %r8
> + lea 49(%rdi, %rdx), %rdi
> + jmp L(StrncpyFillTailWithZero)
> +# else
> + add $48, %rsi
> + add $48, %rdi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +# endif
> +
> +/* If source address alignment == destination address alignment */
> +
> +L(SourceStringAlignmentLess32):
> + pxor %xmm0, %xmm0
> + movdqu (%rsi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %rdx
> +
> +# ifdef USE_AS_STRNCPY
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> + cmp $16, %r8
> +# else
> + cmp $17, %r8
> +# endif
> + jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesTail1)
> +
> + pcmpeqb %xmm2, %xmm0
> + movdqu %xmm1, (%rdi)
> + pmovmskb %xmm0, %rdx
> +
> +# ifdef USE_AS_STRNCPY
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> + cmp $32, %r8
> +# else
> + cmp $33, %r8
> +# endif
> + jbe L(CopyFrom1To32Bytes1Case2OrCase3)
> +# endif
> + test %rdx, %rdx
> + jnz L(CopyFrom1To32Bytes1)
> +
> + and $-16, %rsi
> + and $15, %rcx
> + jmp L(Unalign16Both)
> +
> +/*------End of main part with loops---------------------*/
> +
> +/* Case1 */
> +
> +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> + .p2align 4
> +L(CopyFrom1To16Bytes):
> + add %rcx, %rdi
> + add %rcx, %rsi
> + bsf %rdx, %rdx
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +# endif
> + .p2align 4
> +L(CopyFrom1To16BytesTail):
> + add %rcx, %rsi
> + bsf %rdx, %rdx
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +
> + .p2align 4
> +L(CopyFrom1To32Bytes1):
> + add $16, %rsi
> + add $16, %rdi
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $16, %r8
> +# endif
> +L(CopyFrom1To16BytesTail1):
> + bsf %rdx, %rdx
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +
> + .p2align 4
> +L(CopyFrom1To32Bytes):
> + bsf %rdx, %rdx
> + add %rcx, %rsi
> + add $16, %rdx
> + sub %rcx, %rdx
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnaligned_0):
> + bsf %rdx, %rdx
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> +# ifdef USE_AS_STPCPY
> + lea (%rdi, %rdx), %rax
> +# endif
> + movdqu %xmm4, (%rdi)
> + add $63, %r8
> + sub %rdx, %r8
> + lea 1(%rdi, %rdx), %rdi
> + jmp L(StrncpyFillTailWithZero)
> +# else
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +# endif
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnaligned_16):
> + bsf %rcx, %rdx
> + movdqu %xmm4, (%rdi)
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> +# ifdef USE_AS_STPCPY
> + lea 16(%rdi, %rdx), %rax
> +# endif
> + movdqu %xmm5, 16(%rdi)
> + add $47, %r8
> + sub %rdx, %r8
> + lea 17(%rdi, %rdx), %rdi
> + jmp L(StrncpyFillTailWithZero)
> +# else
> + add $16, %rsi
> + add $16, %rdi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +# endif
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnaligned_32):
> + bsf %rdx, %rdx
> + movdqu %xmm4, (%rdi)
> + movdqu %xmm5, 16(%rdi)
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> +# ifdef USE_AS_STPCPY
> + lea 32(%rdi, %rdx), %rax
> +# endif
> + movdqu %xmm6, 32(%rdi)
> + add $31, %r8
> + sub %rdx, %r8
> + lea 33(%rdi, %rdx), %rdi
> + jmp L(StrncpyFillTailWithZero)
> +# else
> + add $32, %rsi
> + add $32, %rdi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +# endif
> +
> +# ifdef USE_AS_STRNCPY
> +# ifndef USE_AS_STRCAT
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm6):
> + movdqu %xmm6, (%rdi, %rcx)
> + jmp L(CopyFrom1To16BytesXmmExit)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm5):
> + movdqu %xmm5, (%rdi, %rcx)
> + jmp L(CopyFrom1To16BytesXmmExit)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm4):
> + movdqu %xmm4, (%rdi, %rcx)
> + jmp L(CopyFrom1To16BytesXmmExit)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm3):
> + movdqu %xmm3, (%rdi, %rcx)
> + jmp L(CopyFrom1To16BytesXmmExit)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm1):
> + movdqu %xmm1, (%rdi, %rcx)
> + jmp L(CopyFrom1To16BytesXmmExit)
> +# endif
> +
> + .p2align 4
> +L(CopyFrom1To16BytesExit):
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> +
> +/* Case2 */
> +
> + .p2align 4
> +L(CopyFrom1To16BytesCase2):
> + add $16, %r8
> + add %rcx, %rdi
> + add %rcx, %rsi
> + bsf %rdx, %rdx
> + cmp %r8, %rdx
> + jb L(CopyFrom1To16BytesExit)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> + .p2align 4
> +L(CopyFrom1To32BytesCase2):
> + add %rcx, %rsi
> + bsf %rdx, %rdx
> + add $16, %rdx
> + sub %rcx, %rdx
> + cmp %r8, %rdx
> + jb L(CopyFrom1To16BytesExit)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> +L(CopyFrom1To16BytesTailCase2):
> + add %rcx, %rsi
> + bsf %rdx, %rdx
> + cmp %r8, %rdx
> + jb L(CopyFrom1To16BytesExit)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> +L(CopyFrom1To16BytesTail1Case2):
> + bsf %rdx, %rdx
> + cmp %r8, %rdx
> + jb L(CopyFrom1To16BytesExit)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> +/* Case2 or Case3, Case3 */
> +
> + .p2align 4
> +L(CopyFrom1To16BytesCase2OrCase3):
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesCase2)
> +L(CopyFrom1To16BytesCase3):
> + add $16, %r8
> + add %rcx, %rdi
> + add %rcx, %rsi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> + .p2align 4
> +L(CopyFrom1To32BytesCase2OrCase3):
> + test %rdx, %rdx
> + jnz L(CopyFrom1To32BytesCase2)
> + add %rcx, %rsi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesTailCase2OrCase3):
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesTailCase2)
> + add %rcx, %rsi
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> + .p2align 4
> +L(CopyFrom1To32Bytes1Case2OrCase3):
> + add $16, %rdi
> + add $16, %rsi
> + sub $16, %r8
> +L(CopyFrom1To16BytesTail1Case2OrCase3):
> + test %rdx, %rdx
> + jnz L(CopyFrom1To16BytesTail1Case2)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> +# endif
> +
> +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
> +
> + .p2align 4
> +L(Exit1):
> + mov %dh, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea (%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $1, %r8
> + lea 1(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit2):
> + mov (%rsi), %dx
> + mov %dx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 1(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $2, %r8
> + lea 2(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit3):
> + mov (%rsi), %cx
> + mov %cx, (%rdi)
> + mov %dh, 2(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 2(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $3, %r8
> + lea 3(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit4):
> + mov (%rsi), %edx
> + mov %edx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 3(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $4, %r8
> + lea 4(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit5):
> + mov (%rsi), %ecx
> + mov %dh, 4(%rdi)
> + mov %ecx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 4(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $5, %r8
> + lea 5(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit6):
> + mov (%rsi), %ecx
> + mov 4(%rsi), %dx
> + mov %ecx, (%rdi)
> + mov %dx, 4(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 5(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $6, %r8
> + lea 6(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit7):
> + mov (%rsi), %ecx
> + mov 3(%rsi), %edx
> + mov %ecx, (%rdi)
> + mov %edx, 3(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 6(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $7, %r8
> + lea 7(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit8):
> + mov (%rsi), %rdx
> + mov %rdx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 7(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $8, %r8
> + lea 8(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit9):
> + mov (%rsi), %rcx
> + mov %dh, 8(%rdi)
> + mov %rcx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 8(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $9, %r8
> + lea 9(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit10):
> + mov (%rsi), %rcx
> + mov 8(%rsi), %dx
> + mov %rcx, (%rdi)
> + mov %dx, 8(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 9(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $10, %r8
> + lea 10(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit11):
> + mov (%rsi), %rcx
> + mov 7(%rsi), %edx
> + mov %rcx, (%rdi)
> + mov %edx, 7(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 10(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $11, %r8
> + lea 11(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit12):
> + mov (%rsi), %rcx
> + mov 8(%rsi), %edx
> + mov %rcx, (%rdi)
> + mov %edx, 8(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 11(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $12, %r8
> + lea 12(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit13):
> + mov (%rsi), %rcx
> + mov 5(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 5(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 12(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $13, %r8
> + lea 13(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit14):
> + mov (%rsi), %rcx
> + mov 6(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 6(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 13(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $14, %r8
> + lea 14(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit15):
> + mov (%rsi), %rcx
> + mov 7(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 7(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 14(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $15, %r8
> + lea 15(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit16):
> + movdqu (%rsi), %xmm0
> + movdqu %xmm0, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 15(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $16, %r8
> + lea 16(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit17):
> + movdqu (%rsi), %xmm0
> + movdqu %xmm0, (%rdi)
> + mov %dh, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 16(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $17, %r8
> + lea 17(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit18):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %cx
> + movdqu %xmm0, (%rdi)
> + mov %cx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 17(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $18, %r8
> + lea 18(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit19):
> + movdqu (%rsi), %xmm0
> + mov 15(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 18(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $19, %r8
> + lea 19(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit20):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 19(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $20, %r8
> + lea 20(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit21):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 16(%rdi)
> + mov %dh, 20(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 20(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $21, %r8
> + lea 21(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit22):
> + movdqu (%rsi), %xmm0
> + mov 14(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 14(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 21(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $22, %r8
> + lea 22(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit23):
> + movdqu (%rsi), %xmm0
> + mov 15(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 22(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $23, %r8
> + lea 23(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit24):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 23(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $24, %r8
> + lea 24(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit25):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 16(%rdi)
> + mov %dh, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 24(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $25, %r8
> + lea 25(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit26):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 24(%rsi), %cx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %cx, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 25(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $26, %r8
> + lea 26(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit27):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 23(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %ecx, 23(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 26(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $27, %r8
> + lea 27(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit28):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 24(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %ecx, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 27(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $28, %r8
> + lea 28(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit29):
> + movdqu (%rsi), %xmm0
> + movdqu 13(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 13(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 28(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $29, %r8
> + lea 29(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit30):
> + movdqu (%rsi), %xmm0
> + movdqu 14(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 14(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 29(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $30, %r8
> + lea 30(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit31):
> + movdqu (%rsi), %xmm0
> + movdqu 15(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 30(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $31, %r8
> + lea 31(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Exit32):
> + movdqu (%rsi), %xmm0
> + movdqu 16(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 31(%rdi), %rax
> +# endif
> +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> + sub $32, %r8
> + lea 32(%rdi), %rdi
> + jnz L(StrncpyFillTailWithZero)
> +# endif
> + ret
> +
> +# ifdef USE_AS_STRNCPY
> +
> + .p2align 4
> +L(StrncpyExit0):
> +# ifdef USE_AS_STPCPY
> + mov %rdi, %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, (%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit1):
> + mov (%rsi), %dl
> + mov %dl, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 1(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 1(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit2):
> + mov (%rsi), %dx
> + mov %dx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 2(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 2(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit3):
> + mov (%rsi), %cx
> + mov 2(%rsi), %dl
> + mov %cx, (%rdi)
> + mov %dl, 2(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 3(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 3(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit4):
> + mov (%rsi), %edx
> + mov %edx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 4(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 4(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit5):
> + mov (%rsi), %ecx
> + mov 4(%rsi), %dl
> + mov %ecx, (%rdi)
> + mov %dl, 4(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 5(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 5(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit6):
> + mov (%rsi), %ecx
> + mov 4(%rsi), %dx
> + mov %ecx, (%rdi)
> + mov %dx, 4(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 6(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 6(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit7):
> + mov (%rsi), %ecx
> + mov 3(%rsi), %edx
> + mov %ecx, (%rdi)
> + mov %edx, 3(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 7(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 7(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit8):
> + mov (%rsi), %rdx
> + mov %rdx, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 8(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 8(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit9):
> + mov (%rsi), %rcx
> + mov 8(%rsi), %dl
> + mov %rcx, (%rdi)
> + mov %dl, 8(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 9(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 9(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit10):
> + mov (%rsi), %rcx
> + mov 8(%rsi), %dx
> + mov %rcx, (%rdi)
> + mov %dx, 8(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 10(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 10(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit11):
> + mov (%rsi), %rcx
> + mov 7(%rsi), %edx
> + mov %rcx, (%rdi)
> + mov %edx, 7(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 11(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 11(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit12):
> + mov (%rsi), %rcx
> + mov 8(%rsi), %edx
> + mov %rcx, (%rdi)
> + mov %edx, 8(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 12(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 12(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit13):
> + mov (%rsi), %rcx
> + mov 5(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 5(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 13(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 13(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit14):
> + mov (%rsi), %rcx
> + mov 6(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 6(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 14(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 14(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit15):
> + mov (%rsi), %rcx
> + mov 7(%rsi), %rdx
> + mov %rcx, (%rdi)
> + mov %rdx, 7(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 15(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 15(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit16):
> + movdqu (%rsi), %xmm0
> + movdqu %xmm0, (%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 16(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 16(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit17):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %cl
> + movdqu %xmm0, (%rdi)
> + mov %cl, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 17(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 17(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit18):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %cx
> + movdqu %xmm0, (%rdi)
> + mov %cx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 18(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 18(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit19):
> + movdqu (%rsi), %xmm0
> + mov 15(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 19(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 19(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit20):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 20(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 20(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit21):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %ecx
> + mov 20(%rsi), %dl
> + movdqu %xmm0, (%rdi)
> + mov %ecx, 16(%rdi)
> + mov %dl, 20(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 21(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 21(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit22):
> + movdqu (%rsi), %xmm0
> + mov 14(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 14(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 22(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 22(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit23):
> + movdqu (%rsi), %xmm0
> + mov 15(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 23(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 23(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit24):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rcx
> + movdqu %xmm0, (%rdi)
> + mov %rcx, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 24(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 24(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit25):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 24(%rsi), %cl
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %cl, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 25(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 25(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit26):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 24(%rsi), %cx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %cx, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 26(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 26(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit27):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 23(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %ecx, 23(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 27(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 27(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit28):
> + movdqu (%rsi), %xmm0
> + mov 16(%rsi), %rdx
> + mov 24(%rsi), %ecx
> + movdqu %xmm0, (%rdi)
> + mov %rdx, 16(%rdi)
> + mov %ecx, 24(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 28(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 28(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit29):
> + movdqu (%rsi), %xmm0
> + movdqu 13(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 13(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 29(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 29(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit30):
> + movdqu (%rsi), %xmm0
> + movdqu 14(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 14(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 30(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 30(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit31):
> + movdqu (%rsi), %xmm0
> + movdqu 15(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 15(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 31(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 31(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit32):
> + movdqu (%rsi), %xmm0
> + movdqu 16(%rsi), %xmm2
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 32(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 32(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(StrncpyExit33):
> + movdqu (%rsi), %xmm0
> + movdqu 16(%rsi), %xmm2
> + mov 32(%rsi), %cl
> + movdqu %xmm0, (%rdi)
> + movdqu %xmm2, 16(%rdi)
> + mov %cl, 32(%rdi)
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 33(%rdi)
> +# endif
> + ret
> +
> +# ifndef USE_AS_STRCAT
> +
> + .p2align 4
> +L(Fill0):
> + ret
> +
> + .p2align 4
> +L(Fill1):
> + mov %dl, (%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill2):
> + mov %dx, (%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill3):
> + mov %edx, -1(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill4):
> + mov %edx, (%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill5):
> + mov %edx, (%rdi)
> + mov %dl, 4(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill6):
> + mov %edx, (%rdi)
> + mov %dx, 4(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill7):
> + mov %rdx, -1(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill8):
> + mov %rdx, (%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill9):
> + mov %rdx, (%rdi)
> + mov %dl, 8(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill10):
> + mov %rdx, (%rdi)
> + mov %dx, 8(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill11):
> + mov %rdx, (%rdi)
> + mov %edx, 7(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill12):
> + mov %rdx, (%rdi)
> + mov %edx, 8(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill13):
> + mov %rdx, (%rdi)
> + mov %rdx, 5(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill14):
> + mov %rdx, (%rdi)
> + mov %rdx, 6(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill15):
> + movdqu %xmm0, -1(%rdi)
> + ret
> +
> + .p2align 4
> +L(Fill16):
> + movdqu %xmm0, (%rdi)
> + ret
> +
> + .p2align 4
> +L(CopyFrom1To16BytesUnalignedXmm2):
> + movdqu %xmm2, (%rdi, %rcx)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesXmmExit):
> + bsf %rdx, %rdx
> + add $15, %r8
> + add %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> + lea (%rdi, %rdx), %rax
> +# endif
> + sub %rdx, %r8
> + lea 1(%rdi, %rdx), %rdi
> +
> + .p2align 4
> +L(StrncpyFillTailWithZero):
> + pxor %xmm0, %xmm0
> + xor %rdx, %rdx
> + sub $16, %r8
> + jbe L(StrncpyFillExit)
> +
> + movdqu %xmm0, (%rdi)
> + add $16, %rdi
> +
> + mov %rdi, %rsi
> + and $0xf, %rsi
> + sub %rsi, %rdi
> + add %rsi, %r8
> + sub $64, %r8
> + jb L(StrncpyFillLess64)
> +
> +L(StrncpyFillLoopMovdqa):
> + movdqa %xmm0, (%rdi)
> + movdqa %xmm0, 16(%rdi)
> + movdqa %xmm0, 32(%rdi)
> + movdqa %xmm0, 48(%rdi)
> + add $64, %rdi
> + sub $64, %r8
> + jae L(StrncpyFillLoopMovdqa)
> +
> +L(StrncpyFillLess64):
> + add $32, %r8
> + jl L(StrncpyFillLess32)
> + movdqa %xmm0, (%rdi)
> + movdqa %xmm0, 16(%rdi)
> + add $32, %rdi
> + sub $16, %r8
> + jl L(StrncpyFillExit)
> + movdqa %xmm0, (%rdi)
> + add $16, %rdi
> + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> +
> +L(StrncpyFillLess32):
> + add $16, %r8
> + jl L(StrncpyFillExit)
> + movdqa %xmm0, (%rdi)
> + add $16, %rdi
> + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> +
> +L(StrncpyFillExit):
> + add $16, %r8
> + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> +
> +/* end of ifndef USE_AS_STRCAT */
> +# endif
> +
> + .p2align 4
> +L(UnalignedLeaveCase2OrCase3):
> + test %rdx, %rdx
> + jnz L(Unaligned64LeaveCase2)
> +L(Unaligned64LeaveCase3):
> + lea 64(%r8), %rcx
> + and $-16, %rcx
> + add $48, %r8
> + jl L(CopyFrom1To16BytesCase3)
> + movdqu %xmm4, (%rdi)
> + sub $16, %r8
> + jb L(CopyFrom1To16BytesCase3)
> + movdqu %xmm5, 16(%rdi)
> + sub $16, %r8
> + jb L(CopyFrom1To16BytesCase3)
> + movdqu %xmm6, 32(%rdi)
> + sub $16, %r8
> + jb L(CopyFrom1To16BytesCase3)
> + movdqu %xmm7, 48(%rdi)
> +# ifdef USE_AS_STPCPY
> + lea 64(%rdi), %rax
> +# endif
> +# ifdef USE_AS_STRCAT
> + xor %ch, %ch
> + movb %ch, 64(%rdi)
> +# endif
> + ret
> +
> + .p2align 4
> +L(Unaligned64LeaveCase2):
> + xor %rcx, %rcx
> + pcmpeqb %xmm4, %xmm0
> + pmovmskb %xmm0, %rdx
> + add $48, %r8
> + jle L(CopyFrom1To16BytesCase2OrCase3)
> + test %rdx, %rdx
> +# ifndef USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm4)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> + pcmpeqb %xmm5, %xmm0
> + pmovmskb %xmm0, %rdx
> + movdqu %xmm4, (%rdi)
> + add $16, %rcx
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> + test %rdx, %rdx
> +# ifndef USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm5)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + pcmpeqb %xmm6, %xmm0
> + pmovmskb %xmm0, %rdx
> + movdqu %xmm5, 16(%rdi)
> + add $16, %rcx
> + sub $16, %r8
> + jbe L(CopyFrom1To16BytesCase2OrCase3)
> + test %rdx, %rdx
> +# ifndef USE_AS_STRCAT
> + jnz L(CopyFrom1To16BytesUnalignedXmm6)
> +# else
> + jnz L(CopyFrom1To16Bytes)
> +# endif
> +
> + pcmpeqb %xmm7, %xmm0
> + pmovmskb %xmm0, %rdx
> + movdqu %xmm6, 32(%rdi)
> + lea 16(%rdi, %rcx), %rdi
> + lea 16(%rsi, %rcx), %rsi
> + bsf %rdx, %rdx
> + cmp %r8, %rdx
> + jb L(CopyFrom1To16BytesExit)
> + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> +
> + .p2align 4
> +L(ExitZero):
> +# ifndef USE_AS_STRCAT
> + mov %rdi, %rax
> +# endif
> + ret
> +
> +# endif
> +
> +# ifndef USE_AS_STRCAT
> +END (STRCPY)
> +# else
> +END (STRCAT)
> +# endif
> + .p2align 4
> + .section .rodata
> +L(ExitTable):
> + .int JMPTBL(L(Exit1), L(ExitTable))
> + .int JMPTBL(L(Exit2), L(ExitTable))
> + .int JMPTBL(L(Exit3), L(ExitTable))
> + .int JMPTBL(L(Exit4), L(ExitTable))
> + .int JMPTBL(L(Exit5), L(ExitTable))
> + .int JMPTBL(L(Exit6), L(ExitTable))
> + .int JMPTBL(L(Exit7), L(ExitTable))
> + .int JMPTBL(L(Exit8), L(ExitTable))
> + .int JMPTBL(L(Exit9), L(ExitTable))
> + .int JMPTBL(L(Exit10), L(ExitTable))
> + .int JMPTBL(L(Exit11), L(ExitTable))
> + .int JMPTBL(L(Exit12), L(ExitTable))
> + .int JMPTBL(L(Exit13), L(ExitTable))
> + .int JMPTBL(L(Exit14), L(ExitTable))
> + .int JMPTBL(L(Exit15), L(ExitTable))
> + .int JMPTBL(L(Exit16), L(ExitTable))
> + .int JMPTBL(L(Exit17), L(ExitTable))
> + .int JMPTBL(L(Exit18), L(ExitTable))
> + .int JMPTBL(L(Exit19), L(ExitTable))
> + .int JMPTBL(L(Exit20), L(ExitTable))
> + .int JMPTBL(L(Exit21), L(ExitTable))
> + .int JMPTBL(L(Exit22), L(ExitTable))
> + .int JMPTBL(L(Exit23), L(ExitTable))
> + .int JMPTBL(L(Exit24), L(ExitTable))
> + .int JMPTBL(L(Exit25), L(ExitTable))
> + .int JMPTBL(L(Exit26), L(ExitTable))
> + .int JMPTBL(L(Exit27), L(ExitTable))
> + .int JMPTBL(L(Exit28), L(ExitTable))
> + .int JMPTBL(L(Exit29), L(ExitTable))
> + .int JMPTBL(L(Exit30), L(ExitTable))
> + .int JMPTBL(L(Exit31), L(ExitTable))
> + .int JMPTBL(L(Exit32), L(ExitTable))
> +# ifdef USE_AS_STRNCPY
> +L(ExitStrncpyTable):
> + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
> + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
> +# ifndef USE_AS_STRCAT
> + .p2align 4
> +L(FillTable):
> + .int JMPTBL(L(Fill0), L(FillTable))
> + .int JMPTBL(L(Fill1), L(FillTable))
> + .int JMPTBL(L(Fill2), L(FillTable))
> + .int JMPTBL(L(Fill3), L(FillTable))
> + .int JMPTBL(L(Fill4), L(FillTable))
> + .int JMPTBL(L(Fill5), L(FillTable))
> + .int JMPTBL(L(Fill6), L(FillTable))
> + .int JMPTBL(L(Fill7), L(FillTable))
> + .int JMPTBL(L(Fill8), L(FillTable))
> + .int JMPTBL(L(Fill9), L(FillTable))
> + .int JMPTBL(L(Fill10), L(FillTable))
> + .int JMPTBL(L(Fill11), L(FillTable))
> + .int JMPTBL(L(Fill12), L(FillTable))
> + .int JMPTBL(L(Fill13), L(FillTable))
> + .int JMPTBL(L(Fill14), L(FillTable))
> + .int JMPTBL(L(Fill15), L(FillTable))
> + .int JMPTBL(L(Fill16), L(FillTable))
> +# endif
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
> deleted file mode 100644
> index 7710173..0000000
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
> +++ /dev/null
> @@ -1,1887 +0,0 @@
> -/* strcpy with SSE2 and unaligned load
> - Copyright (C) 2011-2013 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#ifndef NOT_IN_libc
> -
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_sse2_unaligned
> -# endif
> -
> -# endif
> -
> -# define JMPTBL(I, B) I - B
> -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), %rcx; \
> - lea (%r11, %rcx), %rcx; \
> - jmp *%rcx
> -
> -# ifndef USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCPY)
> -# ifdef USE_AS_STRNCPY
> - mov %rdx, %r8
> - test %r8, %r8
> - jz L(ExitZero)
> -# endif
> - mov %rsi, %rcx
> -# ifndef USE_AS_STPCPY
> - mov %rdi, %rax /* save result */
> -# endif
> -
> -# endif
> -
> - and $63, %rcx
> - cmp $32, %rcx
> - jbe L(SourceStringAlignmentLess32)
> -
> - and $-16, %rsi
> - and $15, %rcx
> - pxor %xmm0, %xmm0
> - pxor %xmm1, %xmm1
> -
> - pcmpeqb (%rsi), %xmm1
> - pmovmskb %xmm1, %rdx
> - shr %cl, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - mov $16, %r10
> - sub %rcx, %r10
> - cmp %r10, %r8
> -# else
> - mov $17, %r10
> - sub %rcx, %r10
> - cmp %r10, %r8
> -# endif
> - jbe L(CopyFrom1To16BytesTailCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesTail)
> -
> - pcmpeqb 16(%rsi), %xmm0
> - pmovmskb %xmm0, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> - add $16, %r10
> - cmp %r10, %r8
> - jbe L(CopyFrom1To32BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jnz L(CopyFrom1To32Bytes)
> -
> - movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
> - movdqu %xmm1, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> - .p2align 4
> -L(Unalign16Both):
> - sub %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> - add %rcx, %r8
> -# endif
> - mov $16, %rcx
> - movdqa (%rsi, %rcx), %xmm1
> - movaps 16(%rsi, %rcx), %xmm2
> - movdqu %xmm1, (%rdi, %rcx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $48, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm2)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movaps 16(%rsi, %rcx), %xmm3
> - movdqu %xmm2, (%rdi, %rcx)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm3)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movaps 16(%rsi, %rcx), %xmm4
> - movdqu %xmm3, (%rdi, %rcx)
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm4)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movaps 16(%rsi, %rcx), %xmm1
> - movdqu %xmm4, (%rdi, %rcx)
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm1)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movaps 16(%rsi, %rcx), %xmm2
> - movdqu %xmm1, (%rdi, %rcx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm2)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movaps 16(%rsi, %rcx), %xmm3
> - movdqu %xmm2, (%rdi, %rcx)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $16, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm3)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - movdqu %xmm3, (%rdi, %rcx)
> - mov %rsi, %rdx
> - lea 16(%rsi, %rcx), %rsi
> - and $-0x40, %rsi
> - sub %rsi, %rdx
> - sub %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> - lea 128(%r8, %rdx), %r8
> -# endif
> -L(Unaligned64Loop):
> - movaps (%rsi), %xmm2
> - movaps %xmm2, %xmm4
> - movaps 16(%rsi), %xmm5
> - movaps 32(%rsi), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 48(%rsi), %xmm7
> - pminub %xmm5, %xmm2
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rdx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(UnalignedLeaveCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jnz L(Unaligned64Leave)
> -
> -L(Unaligned64Loop_start):
> - add $64, %rdi
> - add $64, %rsi
> - movdqu %xmm4, -64(%rdi)
> - movaps (%rsi), %xmm2
> - movdqa %xmm2, %xmm4
> - movdqu %xmm5, -48(%rdi)
> - movaps 16(%rsi), %xmm5
> - pminub %xmm5, %xmm2
> - movaps 32(%rsi), %xmm3
> - movdqu %xmm6, -32(%rdi)
> - movaps %xmm3, %xmm6
> - movdqu %xmm7, -16(%rdi)
> - movaps 48(%rsi), %xmm7
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rdx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(UnalignedLeaveCase2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jz L(Unaligned64Loop_start)
> -
> -L(Unaligned64Leave):
> - pxor %xmm1, %xmm1
> -
> - pcmpeqb %xmm4, %xmm0
> - pcmpeqb %xmm5, %xmm1
> - pmovmskb %xmm0, %rdx
> - pmovmskb %xmm1, %rcx
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesUnaligned_0)
> - test %rcx, %rcx
> - jnz L(CopyFrom1To16BytesUnaligned_16)
> -
> - pcmpeqb %xmm6, %xmm0
> - pcmpeqb %xmm7, %xmm1
> - pmovmskb %xmm0, %rdx
> - pmovmskb %xmm1, %rcx
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesUnaligned_32)
> -
> - bsf %rcx, %rdx
> - movdqu %xmm4, (%rdi)
> - movdqu %xmm5, 16(%rdi)
> - movdqu %xmm6, 32(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea 48(%rdi, %rdx), %rax
> -# endif
> - movdqu %xmm7, 48(%rdi)
> - add $15, %r8
> - sub %rdx, %r8
> - lea 49(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - add $48, %rsi
> - add $48, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -# endif
> -
> -/* If source address alignment == destination address alignment */
> -
> -L(SourceStringAlignmentLess32):
> - pxor %xmm0, %xmm0
> - movdqu (%rsi), %xmm1
> - movdqu 16(%rsi), %xmm2
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - cmp $16, %r8
> -# else
> - cmp $17, %r8
> -# endif
> - jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesTail1)
> -
> - pcmpeqb %xmm2, %xmm0
> - movdqu %xmm1, (%rdi)
> - pmovmskb %xmm0, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - cmp $32, %r8
> -# else
> - cmp $33, %r8
> -# endif
> - jbe L(CopyFrom1To32Bytes1Case2OrCase3)
> -# endif
> - test %rdx, %rdx
> - jnz L(CopyFrom1To32Bytes1)
> -
> - and $-16, %rsi
> - and $15, %rcx
> - jmp L(Unalign16Both)
> -
> -/*------End of main part with loops---------------------*/
> -
> -/* Case1 */
> -
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> - add %rcx, %rdi
> - add %rcx, %rsi
> - bsf %rdx, %rdx
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -# endif
> - .p2align 4
> -L(CopyFrom1To16BytesTail):
> - add %rcx, %rsi
> - bsf %rdx, %rdx
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -
> - .p2align 4
> -L(CopyFrom1To32Bytes1):
> - add $16, %rsi
> - add $16, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $16, %r8
> -# endif
> -L(CopyFrom1To16BytesTail1):
> - bsf %rdx, %rdx
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -
> - .p2align 4
> -L(CopyFrom1To32Bytes):
> - bsf %rdx, %rdx
> - add %rcx, %rsi
> - add $16, %rdx
> - sub %rcx, %rdx
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnaligned_0):
> - bsf %rdx, %rdx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> - movdqu %xmm4, (%rdi)
> - add $63, %r8
> - sub %rdx, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -# endif
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnaligned_16):
> - bsf %rcx, %rdx
> - movdqu %xmm4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea 16(%rdi, %rdx), %rax
> -# endif
> - movdqu %xmm5, 16(%rdi)
> - add $47, %r8
> - sub %rdx, %r8
> - lea 17(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - add $16, %rsi
> - add $16, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -# endif
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnaligned_32):
> - bsf %rdx, %rdx
> - movdqu %xmm4, (%rdi)
> - movdqu %xmm5, 16(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea 32(%rdi, %rdx), %rax
> -# endif
> - movdqu %xmm6, 32(%rdi)
> - add $31, %r8
> - sub %rdx, %r8
> - lea 33(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - add $32, %rsi
> - add $32, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -# ifndef USE_AS_STRCAT
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm6):
> - movdqu %xmm6, (%rdi, %rcx)
> - jmp L(CopyFrom1To16BytesXmmExit)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm5):
> - movdqu %xmm5, (%rdi, %rcx)
> - jmp L(CopyFrom1To16BytesXmmExit)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm4):
> - movdqu %xmm4, (%rdi, %rcx)
> - jmp L(CopyFrom1To16BytesXmmExit)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm3):
> - movdqu %xmm3, (%rdi, %rcx)
> - jmp L(CopyFrom1To16BytesXmmExit)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm1):
> - movdqu %xmm1, (%rdi, %rcx)
> - jmp L(CopyFrom1To16BytesXmmExit)
> -# endif
> -
> - .p2align 4
> -L(CopyFrom1To16BytesExit):
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
> -
> -/* Case2 */
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rcx, %rdi
> - add %rcx, %rsi
> - bsf %rdx, %rdx
> - cmp %r8, %rdx
> - jb L(CopyFrom1To16BytesExit)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> - .p2align 4
> -L(CopyFrom1To32BytesCase2):
> - add %rcx, %rsi
> - bsf %rdx, %rdx
> - add $16, %rdx
> - sub %rcx, %rdx
> - cmp %r8, %rdx
> - jb L(CopyFrom1To16BytesExit)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> -L(CopyFrom1To16BytesTailCase2):
> - add %rcx, %rsi
> - bsf %rdx, %rdx
> - cmp %r8, %rdx
> - jb L(CopyFrom1To16BytesExit)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> -L(CopyFrom1To16BytesTail1Case2):
> - bsf %rdx, %rdx
> - cmp %r8, %rdx
> - jb L(CopyFrom1To16BytesExit)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> -/* Case2 or Case3, Case3 */
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesCase2)
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rcx, %rdi
> - add %rcx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> - .p2align 4
> -L(CopyFrom1To32BytesCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyFrom1To32BytesCase2)
> - add %rcx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesTailCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesTailCase2)
> - add %rcx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> - .p2align 4
> -L(CopyFrom1To32Bytes1Case2OrCase3):
> - add $16, %rdi
> - add $16, %rsi
> - sub $16, %r8
> -L(CopyFrom1To16BytesTail1Case2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyFrom1To16BytesTail1Case2)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> -# endif
> -
> -/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
> -
> - .p2align 4
> -L(Exit1):
> - mov %dh, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $1, %r8
> - lea 1(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - mov (%rsi), %dx
> - mov %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $2, %r8
> - lea 2(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - mov (%rsi), %cx
> - mov %cx, (%rdi)
> - mov %dh, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $3, %r8
> - lea 3(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $4, %r8
> - lea 4(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit5):
> - mov (%rsi), %ecx
> - mov %dh, 4(%rdi)
> - mov %ecx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $5, %r8
> - lea 5(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - mov (%rsi), %ecx
> - mov 4(%rsi), %dx
> - mov %ecx, (%rdi)
> - mov %dx, 4(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $6, %r8
> - lea 6(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - mov (%rsi), %ecx
> - mov 3(%rsi), %edx
> - mov %ecx, (%rdi)
> - mov %edx, 3(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $7, %r8
> - lea 7(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit8):
> - mov (%rsi), %rdx
> - mov %rdx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $8, %r8
> - lea 8(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit9):
> - mov (%rsi), %rcx
> - mov %dh, 8(%rdi)
> - mov %rcx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $9, %r8
> - lea 9(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - mov (%rsi), %rcx
> - mov 8(%rsi), %dx
> - mov %rcx, (%rdi)
> - mov %dx, 8(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $10, %r8
> - lea 10(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - mov (%rsi), %rcx
> - mov 7(%rsi), %edx
> - mov %rcx, (%rdi)
> - mov %edx, 7(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $11, %r8
> - lea 11(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - mov (%rsi), %rcx
> - mov 8(%rsi), %edx
> - mov %rcx, (%rdi)
> - mov %edx, 8(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $12, %r8
> - lea 12(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit13):
> - mov (%rsi), %rcx
> - mov 5(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 5(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $13, %r8
> - lea 13(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - mov (%rsi), %rcx
> - mov 6(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 6(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $14, %r8
> - lea 14(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - mov (%rsi), %rcx
> - mov 7(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 7(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $15, %r8
> - lea 15(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit16):
> - movdqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $16, %r8
> - lea 16(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit17):
> - movdqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> - mov %dh, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 16(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $17, %r8
> - lea 17(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit18):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %cx
> - movdqu %xmm0, (%rdi)
> - mov %cx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 17(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $18, %r8
> - lea 18(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit19):
> - movdqu (%rsi), %xmm0
> - mov 15(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 18(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $19, %r8
> - lea 19(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit20):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 19(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $20, %r8
> - lea 20(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit21):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 16(%rdi)
> - mov %dh, 20(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 20(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $21, %r8
> - lea 21(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit22):
> - movdqu (%rsi), %xmm0
> - mov 14(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 14(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 21(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $22, %r8
> - lea 22(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit23):
> - movdqu (%rsi), %xmm0
> - mov 15(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 22(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $23, %r8
> - lea 23(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit24):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 23(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $24, %r8
> - lea 24(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit25):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 16(%rdi)
> - mov %dh, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 24(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $25, %r8
> - lea 25(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit26):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 24(%rsi), %cx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %cx, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 25(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $26, %r8
> - lea 26(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit27):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 23(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %ecx, 23(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 26(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $27, %r8
> - lea 27(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit28):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 24(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %ecx, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 27(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $28, %r8
> - lea 28(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit29):
> - movdqu (%rsi), %xmm0
> - movdqu 13(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 13(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 28(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $29, %r8
> - lea 29(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit30):
> - movdqu (%rsi), %xmm0
> - movdqu 14(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 14(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 29(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $30, %r8
> - lea 30(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit31):
> - movdqu (%rsi), %xmm0
> - movdqu 15(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 30(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $31, %r8
> - lea 31(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit32):
> - movdqu (%rsi), %xmm0
> - movdqu 16(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 31(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $32, %r8
> - lea 32(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(StrncpyExit0):
> -# ifdef USE_AS_STPCPY
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, (%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit1):
> - mov (%rsi), %dl
> - mov %dl, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 1(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit2):
> - mov (%rsi), %dx
> - mov %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 2(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit3):
> - mov (%rsi), %cx
> - mov 2(%rsi), %dl
> - mov %cx, (%rdi)
> - mov %dl, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 3(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit4):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 4(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit5):
> - mov (%rsi), %ecx
> - mov 4(%rsi), %dl
> - mov %ecx, (%rdi)
> - mov %dl, 4(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 5(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit6):
> - mov (%rsi), %ecx
> - mov 4(%rsi), %dx
> - mov %ecx, (%rdi)
> - mov %dx, 4(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 6(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit7):
> - mov (%rsi), %ecx
> - mov 3(%rsi), %edx
> - mov %ecx, (%rdi)
> - mov %edx, 3(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 7(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit8):
> - mov (%rsi), %rdx
> - mov %rdx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 8(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit9):
> - mov (%rsi), %rcx
> - mov 8(%rsi), %dl
> - mov %rcx, (%rdi)
> - mov %dl, 8(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 9(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit10):
> - mov (%rsi), %rcx
> - mov 8(%rsi), %dx
> - mov %rcx, (%rdi)
> - mov %dx, 8(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 10(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit11):
> - mov (%rsi), %rcx
> - mov 7(%rsi), %edx
> - mov %rcx, (%rdi)
> - mov %edx, 7(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 11(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit12):
> - mov (%rsi), %rcx
> - mov 8(%rsi), %edx
> - mov %rcx, (%rdi)
> - mov %edx, 8(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 12(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit13):
> - mov (%rsi), %rcx
> - mov 5(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 5(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 13(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit14):
> - mov (%rsi), %rcx
> - mov 6(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 6(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 14(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit15):
> - mov (%rsi), %rcx
> - mov 7(%rsi), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, 7(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 15(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit16):
> - movdqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 16(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 16(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit17):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %cl
> - movdqu %xmm0, (%rdi)
> - mov %cl, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 17(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 17(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit18):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %cx
> - movdqu %xmm0, (%rdi)
> - mov %cx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 18(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 18(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit19):
> - movdqu (%rsi), %xmm0
> - mov 15(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 19(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 19(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit20):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 20(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 20(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit21):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %ecx
> - mov 20(%rsi), %dl
> - movdqu %xmm0, (%rdi)
> - mov %ecx, 16(%rdi)
> - mov %dl, 20(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 21(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 21(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit22):
> - movdqu (%rsi), %xmm0
> - mov 14(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 14(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 22(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 22(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit23):
> - movdqu (%rsi), %xmm0
> - mov 15(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 23(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 23(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit24):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rcx
> - movdqu %xmm0, (%rdi)
> - mov %rcx, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 24(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 24(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit25):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 24(%rsi), %cl
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %cl, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 25(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 25(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit26):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 24(%rsi), %cx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %cx, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 26(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 26(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit27):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 23(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %ecx, 23(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 27(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 27(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit28):
> - movdqu (%rsi), %xmm0
> - mov 16(%rsi), %rdx
> - mov 24(%rsi), %ecx
> - movdqu %xmm0, (%rdi)
> - mov %rdx, 16(%rdi)
> - mov %ecx, 24(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 28(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 28(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit29):
> - movdqu (%rsi), %xmm0
> - movdqu 13(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 13(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 29(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 29(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit30):
> - movdqu (%rsi), %xmm0
> - movdqu 14(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 14(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 30(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 30(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit31):
> - movdqu (%rsi), %xmm0
> - movdqu 15(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 15(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 31(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 31(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit32):
> - movdqu (%rsi), %xmm0
> - movdqu 16(%rsi), %xmm2
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 16(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 32(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 32(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit33):
> - movdqu (%rsi), %xmm0
> - movdqu 16(%rsi), %xmm2
> - mov 32(%rsi), %cl
> - movdqu %xmm0, (%rdi)
> - movdqu %xmm2, 16(%rdi)
> - mov %cl, 32(%rdi)
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 33(%rdi)
> -# endif
> - ret
> -
> -# ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(Fill0):
> - ret
> -
> - .p2align 4
> -L(Fill1):
> - mov %dl, (%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill2):
> - mov %dx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill3):
> - mov %edx, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill4):
> - mov %edx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill5):
> - mov %edx, (%rdi)
> - mov %dl, 4(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill6):
> - mov %edx, (%rdi)
> - mov %dx, 4(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill7):
> - mov %rdx, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill8):
> - mov %rdx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill9):
> - mov %rdx, (%rdi)
> - mov %dl, 8(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill10):
> - mov %rdx, (%rdi)
> - mov %dx, 8(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill11):
> - mov %rdx, (%rdi)
> - mov %edx, 7(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill12):
> - mov %rdx, (%rdi)
> - mov %edx, 8(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill13):
> - mov %rdx, (%rdi)
> - mov %rdx, 5(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill14):
> - mov %rdx, (%rdi)
> - mov %rdx, 6(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill15):
> - movdqu %xmm0, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(Fill16):
> - movdqu %xmm0, (%rdi)
> - ret
> -
> - .p2align 4
> -L(CopyFrom1To16BytesUnalignedXmm2):
> - movdqu %xmm2, (%rdi, %rcx)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesXmmExit):
> - bsf %rdx, %rdx
> - add $15, %r8
> - add %rcx, %rdi
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> - sub %rdx, %r8
> - lea 1(%rdi, %rdx), %rdi
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero):
> - pxor %xmm0, %xmm0
> - xor %rdx, %rdx
> - sub $16, %r8
> - jbe L(StrncpyFillExit)
> -
> - movdqu %xmm0, (%rdi)
> - add $16, %rdi
> -
> - mov %rdi, %rsi
> - and $0xf, %rsi
> - sub %rsi, %rdi
> - add %rsi, %r8
> - sub $64, %r8
> - jb L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm0, 16(%rdi)
> - movdqa %xmm0, 32(%rdi)
> - movdqa %xmm0, 48(%rdi)
> - add $64, %rdi
> - sub $64, %r8
> - jae L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> - add $32, %r8
> - jl L(StrncpyFillLess32)
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm0, 16(%rdi)
> - add $32, %rdi
> - sub $16, %r8
> - jl L(StrncpyFillExit)
> - movdqa %xmm0, (%rdi)
> - add $16, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> -
> -L(StrncpyFillLess32):
> - add $16, %r8
> - jl L(StrncpyFillExit)
> - movdqa %xmm0, (%rdi)
> - add $16, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> -
> -L(StrncpyFillExit):
> - add $16, %r8
> - BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
> -
> -/* end of ifndef USE_AS_STRCAT */
> -# endif
> -
> - .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(Unaligned64LeaveCase2)
> -L(Unaligned64LeaveCase3):
> - lea 64(%r8), %rcx
> - and $-16, %rcx
> - add $48, %r8
> - jl L(CopyFrom1To16BytesCase3)
> - movdqu %xmm4, (%rdi)
> - sub $16, %r8
> - jb L(CopyFrom1To16BytesCase3)
> - movdqu %xmm5, 16(%rdi)
> - sub $16, %r8
> - jb L(CopyFrom1To16BytesCase3)
> - movdqu %xmm6, 32(%rdi)
> - sub $16, %r8
> - jb L(CopyFrom1To16BytesCase3)
> - movdqu %xmm7, 48(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 64(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - xor %ch, %ch
> - movb %ch, 64(%rdi)
> -# endif
> - ret
> -
> - .p2align 4
> -L(Unaligned64LeaveCase2):
> - xor %rcx, %rcx
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rdx
> - add $48, %r8
> - jle L(CopyFrom1To16BytesCase2OrCase3)
> - test %rdx, %rdx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm4)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> - pcmpeqb %xmm5, %xmm0
> - pmovmskb %xmm0, %rdx
> - movdqu %xmm4, (%rdi)
> - add $16, %rcx
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rdx, %rdx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm5)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - pcmpeqb %xmm6, %xmm0
> - pmovmskb %xmm0, %rdx
> - movdqu %xmm5, 16(%rdi)
> - add $16, %rcx
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rdx, %rdx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyFrom1To16BytesUnalignedXmm6)
> -# else
> - jnz L(CopyFrom1To16Bytes)
> -# endif
> -
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %rdx
> - movdqu %xmm6, 32(%rdi)
> - lea 16(%rdi, %rcx), %rdi
> - lea 16(%rsi, %rcx), %rsi
> - bsf %rdx, %rdx
> - cmp %r8, %rdx
> - jb L(CopyFrom1To16BytesExit)
> - BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
> -
> - .p2align 4
> -L(ExitZero):
> -# ifndef USE_AS_STRCAT
> - mov %rdi, %rax
> -# endif
> - ret
> -
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
> -# endif
> - .p2align 4
> - .section .rodata
> -L(ExitTable):
> - .int JMPTBL(L(Exit1), L(ExitTable))
> - .int JMPTBL(L(Exit2), L(ExitTable))
> - .int JMPTBL(L(Exit3), L(ExitTable))
> - .int JMPTBL(L(Exit4), L(ExitTable))
> - .int JMPTBL(L(Exit5), L(ExitTable))
> - .int JMPTBL(L(Exit6), L(ExitTable))
> - .int JMPTBL(L(Exit7), L(ExitTable))
> - .int JMPTBL(L(Exit8), L(ExitTable))
> - .int JMPTBL(L(Exit9), L(ExitTable))
> - .int JMPTBL(L(Exit10), L(ExitTable))
> - .int JMPTBL(L(Exit11), L(ExitTable))
> - .int JMPTBL(L(Exit12), L(ExitTable))
> - .int JMPTBL(L(Exit13), L(ExitTable))
> - .int JMPTBL(L(Exit14), L(ExitTable))
> - .int JMPTBL(L(Exit15), L(ExitTable))
> - .int JMPTBL(L(Exit16), L(ExitTable))
> - .int JMPTBL(L(Exit17), L(ExitTable))
> - .int JMPTBL(L(Exit18), L(ExitTable))
> - .int JMPTBL(L(Exit19), L(ExitTable))
> - .int JMPTBL(L(Exit20), L(ExitTable))
> - .int JMPTBL(L(Exit21), L(ExitTable))
> - .int JMPTBL(L(Exit22), L(ExitTable))
> - .int JMPTBL(L(Exit23), L(ExitTable))
> - .int JMPTBL(L(Exit24), L(ExitTable))
> - .int JMPTBL(L(Exit25), L(ExitTable))
> - .int JMPTBL(L(Exit26), L(ExitTable))
> - .int JMPTBL(L(Exit27), L(ExitTable))
> - .int JMPTBL(L(Exit28), L(ExitTable))
> - .int JMPTBL(L(Exit29), L(ExitTable))
> - .int JMPTBL(L(Exit30), L(ExitTable))
> - .int JMPTBL(L(Exit31), L(ExitTable))
> - .int JMPTBL(L(Exit32), L(ExitTable))
> -# ifdef USE_AS_STRNCPY
> -L(ExitStrncpyTable):
> - .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
> - .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
> -# ifndef USE_AS_STRCAT
> - .p2align 4
> -L(FillTable):
> - .int JMPTBL(L(Fill0), L(FillTable))
> - .int JMPTBL(L(Fill1), L(FillTable))
> - .int JMPTBL(L(Fill2), L(FillTable))
> - .int JMPTBL(L(Fill3), L(FillTable))
> - .int JMPTBL(L(Fill4), L(FillTable))
> - .int JMPTBL(L(Fill5), L(FillTable))
> - .int JMPTBL(L(Fill6), L(FillTable))
> - .int JMPTBL(L(Fill7), L(FillTable))
> - .int JMPTBL(L(Fill8), L(FillTable))
> - .int JMPTBL(L(Fill9), L(FillTable))
> - .int JMPTBL(L(Fill10), L(FillTable))
> - .int JMPTBL(L(Fill11), L(FillTable))
> - .int JMPTBL(L(Fill12), L(FillTable))
> - .int JMPTBL(L(Fill13), L(FillTable))
> - .int JMPTBL(L(Fill14), L(FillTable))
> - .int JMPTBL(L(Fill15), L(FillTable))
> - .int JMPTBL(L(Fill16), L(FillTable))
> -# endif
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
> index fcc23a7..43c1f97 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
> @@ -1,3 +1,3 @@
> #define USE_AS_STRNCPY
> #define STRCPY __strncpy_sse2_unaligned
> -#include "strcpy-sse2-unaligned.S"
> +#include "strcpy-sse2-unaligned-old.S"
> --
> 1.8.3.2
>