This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.14-44-g8912479
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 24 Jun 2011 19:15:10 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.14-44-g8912479
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 8912479f9ea9f56dc188d3d00c4ba4259f600661 (commit)
via d5495a116c6271c0ae8f6955b64b7b010b1b341a (commit)
via 0b1cbaaef5ccc21baf2c35d4698fb28e82eab385 (commit)
via 07f494a027b3adea1f3cd0cd4ca7c10949cdc476 (commit)
from fa3fc0fe5f452d0aa7e435d8f32e992958683819 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8912479f9ea9f56dc188d3d00c4ba4259f600661
commit 8912479f9ea9f56dc188d3d00c4ba4259f600661
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Fri Jun 24 15:14:22 2011 -0400
Improved st{r,p}{,n}cpy for SSE2 and SSSE3 on x86-64
diff --git a/ChangeLog b/ChangeLog
index 8bf8eeb..b950dcc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2011-06-22 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+ strcpy-sse2-unaligned strncpy-sse2-unaligned
+ stpcpy-sse2-unaligned stpncpy-sse2-unaligned.
+ * sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/stpcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/stpncpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strncpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy.S: Remove strcpy with SSSE3.
+ (STRCPY): Support SSE2 and SSSE3 versions.
+
2011-06-24 Ulrich Drepper <drepper@gmail.com>
[BZ #12874]
diff --git a/NEWS b/NEWS
index dd28004..bc77d2d 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,9 @@ Version 2.15
* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
Contributed by HJ Lu.
+
+* Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
+ Contributed by HJ Lu.
Version 2.14
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 19aa4be..88410b3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,12 +4,15 @@ gen-as-const-headers += ifunc-defines.sym
endif
ifeq ($(subdir),string)
+
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 strlen-sse4 strlen-no-bsf \
- memset-x86-64
+ strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \
+ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+ strcpy-sse2-unaligned strncpy-sse2-unaligned \
+ stpcpy-sse2-unaligned stpncpy-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
new file mode 100644
index 0000000..34231f8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000..d971c2d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
new file mode 100644
index 0000000..658520f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000..14ed16f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
new file mode 100644
index 0000000..9a8d186
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -0,0 +1,1718 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2_unaligned
+# endif
+
+# define JMPTBL(I, B) I - B
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ lea (%r11, %rcx), %rcx; \
+ jmp *%rcx
+
+ .text
+ENTRY (STRCPY) /* entry: reads from (%rsi), writes to (%rdi); this part handles a source not aligned to 16 */
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8 /* keep %rdx (presumably the length limit; confirm against the ABI) in %r8 */
+ test %r8, %r8
+ jz L(ExitZero) /* %r8 == 0: leave through L(ExitZero) */
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+ and $15, %rcx /* %rcx = offset of the source within its 16-byte block */
+ jz L(SourceStringAlignmentZero) /* offset 0: source already 16-byte aligned */
+
+ and $-16, %rsi /* round the source pointer down to a 16-byte boundary */
+ pxor %xmm0, %xmm0 /* %xmm0 = all zero bytes */
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1 /* flag the zero bytes of the aligned block */
+# ifdef USE_AS_STRNCPY
+ add %rcx, %r8 /* count %r8 from the aligned base instead of the string start */
+# endif
+ pmovmskb %xmm1, %rdx /* one mask bit per flagged byte */
+ shr %cl, %rdx /* discard mask bits for bytes before the string start */
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3) /* %r8 at or below the threshold: length-limited path */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail) /* zero byte found in the first block */
+
+ pcmpeqb 16(%rsi), %xmm0 /* flag the zero bytes of the next aligned block */
+ pmovmskb %xmm0, %rdx
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32BytesCase2OrCase3) /* %r8 at or below the threshold: length-limited path */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes) /* zero byte found in the second block */
+
+ movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%rdi)
+
+ sub %rcx, %rdi /* bias %rdi by the same offset so one index addresses both buffers */
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $48, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm4
+ movdqu %xmm3, (%rdi, %rcx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm1
+ movdqu %xmm4, (%rdi, %rcx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rdx, %rdx
+# if defined USE_AS_STRNCPY
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+# else
+ jnz L(CopyFrom1To16Bytes)
+# endif
+
+ movdqu %xmm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea 16(%rsi, %rcx), %rsi
+ and $-0x40, %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+# ifdef USE_AS_STRNCPY
+ lea 128(%r8, %rdx), %r8
+# endif
+L(Unaligned64Loop): /* examine 64 source bytes per pass; loads are aligned, stores are unaligned */
+ movaps (%rsi), %xmm2
+ movaps %xmm2, %xmm4 /* keep an unmodified copy of block 0 in %xmm4 */
+ movaps 16(%rsi), %xmm5
+ movaps 32(%rsi), %xmm3
+ movaps %xmm3, %xmm6 /* keep an unmodified copy of block 2 in %xmm6 */
+ movaps 48(%rsi), %xmm7
+ pminub %xmm5, %xmm2 /* byte-wise unsigned minimum of blocks 0 and 1 */
+ pminub %xmm7, %xmm3 /* byte-wise unsigned minimum of blocks 2 and 3 */
+ pminub %xmm2, %xmm3 /* minimum over all four blocks: has a zero byte iff some block has one */
+ pcmpeqb %xmm0, %xmm3 /* compare with %xmm0 (expected to still be all zero here; confirm) */
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3) /* %r8 ran out within these 64 bytes */
+# endif
+ test %rdx, %rdx
+ jnz L(Unaligned64Leave) /* a zero byte is somewhere in these 64 bytes */
+
+L(Unaligned64Loop_start): /* store the 64 bytes just checked while loading and checking the next 64 */
+ add $64, %rdi
+ add $64, %rsi
+ movdqu %xmm4, -64(%rdi) /* store previous block 0 */
+ movaps (%rsi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%rdi) /* store previous block 1 */
+ movaps 16(%rsi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%rsi), %xmm3
+ movdqu %xmm6, -32(%rdi) /* store previous block 2 */
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%rdi) /* store previous block 3 */
+ movaps 48(%rsi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3 /* minimum over the four new blocks */
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3) /* %r8 ran out within these 64 bytes */
+# endif
+ test %rdx, %rdx
+ jz L(Unaligned64Loop_start) /* no zero byte yet: keep looping */
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %rcx, %rcx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, 32(%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm7, 48(%rdi)
+ add $15, %r8
+ sub %rdx, %r8
+ lea 49(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $48, %rsi
+ add $48, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero): /* reached from the entry code when %rsi & 15 == 0 */
+ pxor %xmm0, %xmm0 /* %xmm0 = all zero bytes */
+ movdqa (%rsi), %xmm1 /* aligned load of the first 16 source bytes */
+ pcmpeqb %xmm1, %xmm0 /* flag the zero bytes of that block */
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3) /* %r8 at or below the threshold: length-limited path */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1) /* zero byte found in the first block */
+
+ pcmpeqb 16(%rsi), %xmm0 /* %xmm0 is all zero again (nothing matched), so this flags zero bytes of block 2 */
+ movdqu %xmm1, (%rdi) /* store the first 16 bytes */
+ pmovmskb %xmm0, %rdx
+
+# ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3) /* %r8 at or below the threshold: length-limited path */
+# endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1) /* zero byte found in the second block */
+ jmp L(Unalign16Both) /* no zero byte in the first 32 bytes: continue in the unrolled copy */
+
+/* ------End of main part with loops--------------------- */
+
+/* Case1 */
+
+# if (!defined USE_AS_STRNCPY)
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+# if defined USE_AS_STRNCPY
+ sub %rcx, %r8
+# endif
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %rsi
+ add $16, %rdi
+# if defined USE_AS_STRNCPY
+ sub $16, %r8
+# endif
+L(CopyFrom1To16BytesTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+# if defined USE_AS_STRNCPY
+ sub %rcx, %r8
+# endif
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $16, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %rdx, %rdx
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ movdqu %xmm4, (%rdi)
+ add $63, %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm5, 16(%rdi)
+ add $47, %r8
+ sub %rdx, %r8
+ lea 17(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $16, %rsi
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %rdx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+# if defined USE_AS_STRNCPY
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm6, 32(%rdi)
+ add $31, %r8
+ sub %rdx, %r8
+ lea 33(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+# else
+ add $32, %rsi
+ add $32, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+# endif
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %rcx, %r8
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %rcx, %r8
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %rcx, %r8
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %rcx, %r8
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+# endif
+
+/* ----End of labels for copying 1-16 bytes and 1-32 bytes---- */
+
+ .p2align 4
+L(Exit1): /* write one byte at (%rdi), then return or zero-fill */
+ mov %dh, (%rdi) /* stores %dh; presumably zero because %rdx holds a small bsf index (verify against ExitTable users) */
+# ifdef USE_AS_STPCPY
+ lea (%rdi), %rax /* %rax = address of the byte just written */
+# endif
+# if defined USE_AS_STRNCPY
+ sub $1, %r8 /* one byte consumed from the remaining count */
+ lea 1(%rdi), %rdi /* lea leaves the flags alone, so jnz below still tests the sub result */
+ jnz L(StrncpyFillTailWithZero) /* count not exhausted: zero-fill the rest */
+# endif
+ ret
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+# if defined USE_AS_STRNCPY
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+# ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+# endif
+ ret
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+ ret
+
+ .p2align 4
+L(Fill0):
+ ret
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi)
+ ret
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ ret
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi)
+ ret
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ ret
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ ret
+
+ .p2align 4
+L(Fill11): /* FillTable entry 11: store 11 bytes at (%rdi) with overlapping stores. */
+ mov %rdx, (%rdi) /* bytes 0-7 */
+ mov %edx, 7(%rdi) /* bytes 7-10 (overlaps byte 7) */
+ ret
+
+ .p2align 4
+L(Fill12): /* FillTable entry 12: store 12 bytes at (%rdi). */
+ mov %rdx, (%rdi) /* bytes 0-7 */
+ mov %edx, 8(%rdi) /* bytes 8-11 */
+ ret
+
+ .p2align 4
+L(Fill13): /* FillTable entry 13: store 13 bytes at (%rdi) with overlapping stores. */
+ mov %rdx, (%rdi) /* bytes 0-7 */
+ mov %rdx, 5(%rdi) /* bytes 5-12 (overlaps 5-7) */
+ ret
+
+ .p2align 4
+L(Fill14): /* FillTable entry 14: store 14 bytes at (%rdi) with overlapping stores. */
+ mov %rdx, (%rdi) /* bytes 0-7 */
+ mov %rdx, 6(%rdi) /* bytes 6-13 (overlaps 6-7) */
+ ret
+
+ .p2align 4
+L(Fill15): /* FillTable entry 15: cover rdi..rdi+14 with a single 16-byte store of xmm0. */
+ movdqu %xmm0, -1(%rdi) /* writes rdi-1..rdi+14; assumes the byte at rdi-1 is already meant to hold this value - TODO confirm for all callers */
+ ret
+
+ .p2align 4
+L(Fill16): /* FillTable entry 16: store the 16 bytes of xmm0 at (%rdi); xmm0 is zeroed in StrncpyFillTailWithZero. */
+ movdqu %xmm0, (%rdi)
+ ret
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2): /* Store xmm2 at rdi+rcx, then fall through (past alignment padding) into CopyFrom1To16BytesXmmExit. */
+ movdqu %xmm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit): /* Turn the mask in rdx into a byte index, adjust r8 and rdi, then fall through into StrncpyFillTailWithZero. */
+ bsf %rdx, %rdx /* rdx = index of the lowest set bit of the mask */
+ add $15, %r8 /* together with the sub below: r8 = r8 + 15 - index */
+ add %rcx, %rdi /* rdi += rcx (offset of the chunk just handled) */
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax /* stpcpy build: rax = address of the byte at that index */
+# endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi /* rdi = first byte after that index */
+
+ .p2align 4
+L(StrncpyFillTailWithZero): /* Write zero bytes starting at rdi; r8 carries the remaining count and the last 0..16 bytes are finished via FillTable. */
+ pxor %xmm0, %xmm0 /* xmm0 = 0, the fill pattern for 16-byte stores */
+ xor %rdx, %rdx /* rdx = 0, the fill pattern used by the FillN stubs */
+ sub $16, %r8
+ jbe L(StrncpyFillExit) /* r8 was <= 16 (unsigned): a single FillN finishes the job */
+
+ movdqu %xmm0, (%rdi) /* one unaligned 16-byte store so the rest can use aligned stores */
+ add $16, %rdi
+
+ mov %rdi, %rsi
+ and $0xf, %rsi /* rsi = misalignment of rdi */
+ sub %rsi, %rdi /* round rdi down to a 16-byte boundary */
+ add %rsi, %r8 /* bytes stepped back are added to the remaining count */
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* 64 bytes of zeros per iteration with aligned stores */
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ movdqa %xmm0, 32(%rdi)
+ movdqa %xmm0, 48(%rdi)
+ add $64, %rdi
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64): /* r8 is negative here: remaining bytes minus 64 */
+ add $32, %r8
+ jl L(StrncpyFillLess32) /* fewer than 32 bytes remain */
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ add $32, %rdi
+ sub $16, %r8
+ jl L(StrncpyFillExit) /* fewer than 16 bytes remain after those 32 */
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) /* r8 is 0..15 here */
+
+L(StrncpyFillLess32):
+ add $16, %r8
+ jl L(StrncpyFillExit) /* fewer than 16 bytes remain */
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) /* r8 is 0..15 here */
+
+L(StrncpyFillExit):
+ add $16, %r8 /* undo the last subtraction of 16 so r8 is the true remaining count */
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3): /* Leave the 64-byte loop: a non-zero mask in rdx selects Case2, a zero mask falls into Case3. */
+ test %rdx, %rdx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3): /* mask is zero: store xmm4..xmm7 for as long as r8 allows */
+ lea 64(%r8), %rcx
+ and $-16, %rcx /* rcx = (r8 + 64) rounded down to a multiple of 16 */
+ add $48, %r8
+ jl L(CopyFrom1To16BytesCase3) /* target is outside this excerpt */
+ movdqu %xmm4, (%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%rdi)
+# ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax /* stpcpy build: rax = rdi + 64 */
+# endif
+ ret
+
+ .p2align 4
+L(Unaligned64LeaveCase2): /* Re-examine xmm4..xmm7 one 16-byte chunk at a time; rcx tracks the chunk offset, r8 the remaining count. */
+ xor %rcx, %rcx
+ pcmpeqb %xmm4, %xmm0 /* bytes equal to xmm0 become 0xff; assumes xmm0 is all-zero on entry - TODO confirm */
+ pmovmskb %xmm0, %rdx
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0 /* the mask was zero, so xmm0 is all-zero again for this compare */
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm4, (%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm5, 16(%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm6, 32(%rdi)
+ lea 16(%rdi, %rcx), %rdi /* rcx is 32 here, so both pointers advance by 48 to the last chunk */
+ lea 16(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx /* index of the first matching byte in the last chunk */
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit) /* match lies before the count runs out */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) /* otherwise copy exactly r8 bytes */
+
+ .p2align 4
+L(ExitZero): /* Return rdi unchanged in rax without storing anything. */
+ mov %rdi, %rax
+ ret
+
+# endif
+
+END (STRCPY)
+
+ .section .rodata /* switch sections first: .p2align pads the section that is current when it is seen */
+ .p2align 4 /* so this now aligns the jump table itself rather than padding the text section */
+L(ExitTable): /* Jump table of 4-byte entries; entry i refers to L(Exit<i+1>). JMPTBL is defined outside this excerpt. */
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+# ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable): /* Jump table of 4-byte entries; entry i refers to L(StrncpyExit<i>), i = 0..33. Indexed with scale 4 via BRANCH_TO_JMPTBL_ENTRY. */
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+ .p2align 4
+L(FillTable): /* Jump table of 4-byte entries; entry i refers to L(Fill<i>), i = 0..16. Used by StrncpyFillTailWithZero with r8 as index. */
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000..efbd3bf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -0,0 +1,3721 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+ .section .text.ssse3,"ax",@progbits
+ENTRY (STRCPY) /* Entry: handle strings shorter than 16 bytes, then align and dispatch on the source/destination phase difference. */
+ mov %rsi, %rcx /* rcx = source pointer (all loads below go through rcx) */
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8 /* r8 = length limit; presumably the strncpy count argument */
+# endif
+ mov %rdi, %rdx /* rdx = destination pointer (all stores below go through rdx) */
+# ifdef USE_AS_STRNCPY
+ test %r8, %r8
+ jz L(Exit0) /* limit is 0 */
+ cmp $8, %r8
+ jbe L(StrncpyExit8Bytes) /* limit is 1..8 */
+# endif
+ cmpb $0, (%rcx) /* bytes 0-7: branch to Exit<k> when byte k-1 is the terminator */
+ jz L(Exit1)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ cmpb $0, 7(%rcx)
+ jz L(Exit8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %r8
+ jb L(StrncpyExit15Bytes) /* limit is 9..15 */
+# endif
+ cmpb $0, 8(%rcx) /* bytes 8-14 */
+ jz L(Exit9)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ cmpb $0, 14(%rcx)
+ jz L(Exit15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %r8
+ je L(Exit16) /* limit is exactly 16: go to Exit16 without testing byte 15 */
+# endif
+ cmpb $0, 15(%rcx)
+ jz L(Exit16)
+
+# ifdef USE_AS_STRNCPY
+ mov %rcx, %rsi
+ and $0xf, %rsi
+
+/* add rcx_shift (rcx & 0xf) to r8 */
+
+ add %rsi, %r8
+# endif
+ lea 16(%rcx), %rsi
+/* Now:
+ rsi = alignment_16(rcx) + rcx_shift + 16;
+ rcx_shift = rcx - alignment_16(rcx)
+*/
+ and $-16, %rsi
+/* Now:
+ rsi = alignment_16(rcx) + 16
+*/
+ pxor %xmm0, %xmm0
+ mov (%rcx), %r9 /* the first 16 source bytes are known to be non-zero here: copy bytes 0-7 */
+ mov %r9, (%rdx)
+/*
+ look if there is zero symbol in next 16 bytes of string
+ from rsi to rsi + 15 and form mask in xmm0
+*/
+ pcmpeqb (%rsi), %xmm0
+ mov 8(%rcx), %r9 /* copy bytes 8-15 */
+ mov %r9, 8(%rdx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+ pmovmskb %xmm0, %rax
+ sub %rcx, %rsi
+
+/* rsi = 16 - rcx_shift */
+
+/* rax = 0: there isn't end of string from position rsi to rsi+15 */
+
+# ifdef USE_AS_STRNCPY
+ sub $32, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %rdx, %rax
+ lea 16(%rdx), %rdx
+/* Now:
+ rdx = rdx + 16 = alignment_16(rdx) + rdx_shift + 16
+*/
+ and $-16, %rdx
+
+/* Now: rdx = alignment_16(rdx) + 16 */
+
+ sub %rdx, %rax
+
+/* Now: rax = rdx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+ add %rax, %rsi /* rsi = rdx_shift - rcx_shift */
+ lea -1(%rsi), %rsi
+ and $1<<31, %esi /* keep only the sign bit; the 32-bit operation also clears the upper half of rsi */
+ test %rsi, %rsi
+ jnz L(ContinueCopy) /* sign bit set: rdx_shift <= rcx_shift, no adjustment */
+ lea 16(%r8), %r8 /* rdx_shift > rcx_shift: r8 += 16 */
+
+L(ContinueCopy):
+# endif
+ sub %rax, %rcx
+/* Now:
+ case rcx_shift >= rdx_shift:
+ rcx = alignment_16(rcx) + (rcx_shift - rdx_shift) + 16
+ case rcx_shift < rdx_shift:
+ rcx = alignment_16(rcx) + (16 + rcx_shift - rdx_shift)
+*/
+ mov %rcx, %rax
+ and $0xf, %rax
+/* Now:
+ case rcx_shift >= rdx_shift: rax = rcx_shift - rdx_shift
+ case rcx_shift < rdx_shift: rax = (16 + rcx_shift - rdx_shift)
+ rax can be 0, 1, ..., 15
+*/
+ mov $0, %rsi /* mov leaves the flags alone, so the jz below still tests the and result */
+
+/* case: rcx_shift == rdx_shift */
+
+ jz L(Align16Both)
+
+ cmp $8, %rax
+ jae L(ShlHigh8) /* 8..15 are dispatched at ShlHigh8, which reuses these flags */
+ cmp $1, %rax
+ je L(Shl1)
+ cmp $2, %rax
+ je L(Shl2)
+ cmp $3, %rax
+ je L(Shl3)
+ cmp $4, %rax
+ je L(Shl4)
+ cmp $5, %rax
+ je L(Shl5)
+ cmp $6, %rax
+ je L(Shl6)
+ jmp L(Shl7) /* only 7 is left */
+
+L(ShlHigh8): /* Dispatch for rax = 8..15; entered by jae straight after cmp $8, %rax. */
+ je L(Shl8) /* still testing the flags of that cmp $8 */
+ cmp $9, %rax
+ je L(Shl9)
+ cmp $10, %rax
+ je L(Shl10)
+ cmp $11, %rax
+ je L(Shl11)
+ cmp $12, %rax
+ je L(Shl12)
+ cmp $13, %rax
+ je L(Shl13)
+ cmp $14, %rax
+ je L(Shl14)
+ jmp L(Shl15) /* only 15 is left */
+
+L(Align16Both): /* Source and destination share the same 16-byte phase: six unrolled aligned steps, then set up the 64-byte loop. */
+ movaps (%rcx), %xmm1
+ movaps 16(%rcx), %xmm2
+ movaps %xmm1, (%rdx)
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of the next vector; xmm0 is all-zero whenever the previous mask was zero */
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi /* rsi = running offset from rcx/rdx (starts at 0) */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3 /* same step again: load next, store previous, test */
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm4
+ movaps %xmm3, (%rdx, %rsi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm1
+ movaps %xmm4, (%rdx, %rsi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm2
+ movaps %xmm1, (%rdx, %rsi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%rcx, %rsi), %xmm3
+ movaps %xmm2, (%rdx, %rsi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%rdx, %rsi)
+ mov %rcx, %rax
+ lea 16(%rcx, %rsi), %rcx
+ and $-0x40, %rcx /* rcx = next source address rounded down to 64 bytes */
+ sub %rcx, %rax /* rax = old rcx - new rcx */
+ sub %rax, %rdx /* shift rdx by the same distance so both pointers stay in step */
+# ifdef USE_AS_STRNCPY
+ lea 48+64(%r8, %rax), %r8 /* r8 = r8 + rax + 112 */
+# endif
+ mov $-0x40, %rsi /* rsi = -64 for the code after the loop */
+
+L(Aligned64Loop): /* Copy 64 aligned bytes per iteration until one of them is zero (or, for strncpy, r8 runs out). */
+ movaps (%rcx), %xmm2
+ movaps %xmm2, %xmm4 /* xmm4..xmm7 keep the four unmodified source vectors */
+ movaps 16(%rcx), %xmm5
+ movaps 32(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rcx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3 /* xmm3 = byte-wise minimum of all four vectors */
+ pcmpeqb %xmm0, %xmm3 /* a byte equal to xmm0 (expected all-zero) means some vector has a zero byte in that lane */
+ pmovmskb %xmm3, %rax
+ lea 64(%rdx), %rdx /* advance both pointers before storing; stores below use negative offsets */
+ lea 64(%rcx), %rcx
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%rdx)
+ movaps %xmm5, -48(%rdx)
+ movaps %xmm6, -32(%rdx)
+ movaps %xmm7, -16(%rdx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave): /* A zero byte is somewhere in xmm4..xmm7: find the 16-byte chunk, storing the clean chunks on the way. */
+# ifdef USE_AS_STRNCPY
+ lea 48(%r8), %r8
+# endif
+ pcmpeqb %xmm4, %xmm0 /* chunk 0; xmm0 is expected to be all-zero on entry */
+ pmovmskb %xmm0, %rax
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0 /* chunk 1 (xmm0 is all-zero again because the previous mask was zero) */
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi /* lea does not touch the flags, so jnz still tests rax */
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0 /* chunk 2 */
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ test %rax, %rax
+ lea 16(%rsi), %rsi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%rdx)
+ pcmpeqb %xmm7, %xmm0 /* chunk 3: the zero byte must be here, so the jump below is unconditional */
+# ifdef USE_AS_STRNCPY
+ lea -16(%r8), %r8
+# endif
+ pmovmskb %xmm0, %rax
+ lea 16(%rsi), %rsi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl1): /* Dispatched when (rcx & 0xf) == 1: read the source with aligned loads and realign with palignr $1. */
+ movaps -1(%rcx), %xmm1 /* aligned vector whose bytes 1..15 are the source bytes at rcx.. */
+ movaps 15(%rcx), %xmm2 /* next aligned vector */
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 31(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -15(%rcx), %rcx /* restore the phase: 15(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -1(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl1LoopStart): /* Main loop for shift 1: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 15(%rcx), %xmm2
+ movaps 31(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $1, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $1, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl1Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $15, %xmm6 /* keep only its top 1 byte, moved to the bottom */
+ mov $15, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $1, %xmm1, %xmm6 /* low 15 bytes = xmm1 bytes 1..15; top 1 byte = the destination byte saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 15 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2): /* Dispatched when (rcx & 0xf) == 2: read the source with aligned loads and realign with palignr $2. */
+ movaps -2(%rcx), %xmm1 /* aligned vector whose bytes 2..15 are the source bytes at rcx.. */
+ movaps 14(%rcx), %xmm2 /* next aligned vector */
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 30(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -14(%rcx), %rcx /* restore the phase: 14(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -2(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl2LoopStart): /* Main loop for shift 2: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 14(%rcx), %xmm2
+ movaps 30(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $2, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $2, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl2Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $14, %xmm6 /* keep only its top 2 bytes, moved to the bottom */
+ mov $14, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $2, %xmm1, %xmm6 /* low 14 bytes = xmm1 bytes 2..15; top 2 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 14 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3): /* Dispatched when (rcx & 0xf) == 3: read the source with aligned loads and realign with palignr $3. */
+ movaps -3(%rcx), %xmm1 /* aligned vector whose bytes 3..15 are the source bytes at rcx.. */
+ movaps 13(%rcx), %xmm2 /* next aligned vector */
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 29(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -13(%rcx), %rcx /* restore the phase: 13(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -3(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl3LoopStart): /* Main loop for shift 3: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 13(%rcx), %xmm2
+ movaps 29(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $3, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $3, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl3Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $13, %xmm6 /* keep only its top 3 bytes, moved to the bottom */
+ mov $13, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $3, %xmm1, %xmm6 /* low 13 bytes = xmm1 bytes 3..15; top 3 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 13 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4): /* Dispatched when (rcx & 0xf) == 4: read the source with aligned loads and realign with palignr $4. */
+ movaps -4(%rcx), %xmm1 /* aligned vector whose bytes 4..15 are the source bytes at rcx.. */
+ movaps 12(%rcx), %xmm2 /* next aligned vector */
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 28(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -12(%rcx), %rcx /* restore the phase: 12(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -4(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl4LoopStart): /* Main loop for shift 4: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 12(%rcx), %xmm2
+ movaps 28(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $4, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $4, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl4Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $12, %xmm6 /* keep only its top 4 bytes, moved to the bottom */
+ mov $12, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $4, %xmm1, %xmm6 /* low 12 bytes = xmm1 bytes 4..15; top 4 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 12 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5): /* Dispatched when (rcx & 0xf) == 5: read the source with aligned loads and realign with palignr $5. */
+ movaps -5(%rcx), %xmm1 /* aligned vector whose bytes 5..15 are the source bytes at rcx.. */
+ movaps 11(%rcx), %xmm2 /* next aligned vector */
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 27(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -11(%rcx), %rcx /* restore the phase: 11(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -5(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl5LoopStart): /* Main loop for shift 5: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 11(%rcx), %xmm2
+ movaps 27(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $5, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $5, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl5Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $11, %xmm6 /* keep only its top 5 bytes, moved to the bottom */
+ mov $11, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $5, %xmm1, %xmm6 /* low 11 bytes = xmm1 bytes 5..15; top 5 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 11 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6): /* Dispatched when (rcx & 0xf) == 6: read the source with aligned loads and realign with palignr $6. */
+ movaps -6(%rcx), %xmm1 /* aligned vector whose bytes 6..15 are the source bytes at rcx.. */
+ movaps 10(%rcx), %xmm2 /* next aligned vector */
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 26(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -10(%rcx), %rcx /* restore the phase: 10(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -6(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl6LoopStart): /* Main loop for shift 6: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 10(%rcx), %xmm2
+ movaps 26(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $6, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $6, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl6Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $10, %xmm6 /* keep only its top 6 bytes, moved to the bottom */
+ mov $10, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $6, %xmm1, %xmm6 /* low 10 bytes = xmm1 bytes 6..15; top 6 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 10 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7): /* Dispatched when (rcx & 0xf) == 7: read the source with aligned loads and realign with palignr $7. */
+ movaps -7(%rcx), %xmm1 /* aligned vector whose bytes 7..15 are the source bytes at rcx.. */
+ movaps 9(%rcx), %xmm2 /* next aligned vector */
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 25(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -9(%rcx), %rcx /* restore the phase: 9(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -7(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl7LoopStart): /* Main loop for shift 7: four aligned loads, one zero test, four realigned stores per 64 bytes. */
+ movaps 9(%rcx), %xmm2
+ movaps 25(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* xmm7 = byte-wise minimum of the four vectors */
+ pcmpeqb %xmm0, %xmm7 /* a match means some vector has a zero byte (xmm0 expected all-zero) */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* unshifted last vector, becomes xmm1 for the next iteration */
+ palignr $7, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $7, %xmm3, %xmm4 /* palignr does not modify the flags, so jnz still tests rax */
+ jnz L(Shl7Start) /* zero byte seen: redo this stretch 16 bytes at a time (xmm1 and xmm2 are still intact) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit): /* Reached when the vector just examined contains a zero byte. */
+ movaps (%rdx), %xmm6 /* read back the current 16-byte destination chunk */
+ psrldq $9, %xmm6 /* keep only its top 7 bytes, moved to the bottom */
+ mov $9, %rsi /* presumably the offset CopyFrom1To16Bytes continues from - TODO confirm */
+ palignr $7, %xmm1, %xmm6 /* low 9 bytes = xmm1 bytes 7..15; top 7 bytes = the destination bytes saved above */
+ movaps %xmm6, (%rdx) /* net effect: only the first 9 bytes of the chunk change */
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8): /* Dispatched when (rcx & 0xf) == 8: read the source with aligned loads and realign with palignr $8. */
+ movaps -8(%rcx), %xmm1 /* aligned vector whose bytes 8..15 are the source bytes at rcx.. */
+ movaps 8(%rcx), %xmm2 /* next aligned vector */
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0 /* flag zero bytes of xmm2; xmm0 is expected to be all-zero here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm2, %xmm3 /* keep an unshifted copy: it becomes xmm1 of the next step */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2 /* xmm2 = the 16 source bytes starting at rcx */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4: same pattern, pointers advance by 16 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 24(%rcx), %rcx /* rcx = address of the next aligned source vector */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round that address down to 64 bytes */
+ sub %rcx, %rax /* rax = bytes stepped back by the rounding */
+ lea -8(%rcx), %rcx /* restore the phase: 8(%rcx) is now 64-byte aligned */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and add those bytes back to the remaining count */
+# endif
+ movaps -8(%rcx), %xmm1 /* reload the carry-in vector for the 64-byte loop */
+
+L(Shl8LoopStart):
+ movaps 8(%rcx), %xmm2
+ movaps 24(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $8, %xmm3, %xmm4
+ jnz L(Shl8Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movaps (%rdx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %rsi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9): /* Copy path for a source pointer 9 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $9. */
+ movaps -9(%rcx), %xmm1 /* aligned chunk whose top 7 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 7(%rcx), %xmm2 /* following aligned chunk */
+L(Shl9Start): /* re-entry point used by L(Shl9LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit) /* match found in the new chunk */
+
+ palignr $9, %xmm1, %xmm2 /* %xmm2 = top 7 bytes of %xmm1 followed by low 9 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 23(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -7(%rcx), %rcx /* restore the 9-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -9(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl9LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 7(%rcx), %xmm2
+ movaps 23(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $9, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $9, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl9Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave9) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit): /* match in the chunk at 7(%rcx): write the last 7 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $7, %xmm6 /* drop its low 7 bytes */
+ mov $7, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $9, %xmm1, %xmm6 /* low 7 bytes = top 7 bytes of %xmm1, upper 9 bytes = destination bytes 7..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10): /* Copy path for a source pointer 10 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $10. */
+ movaps -10(%rcx), %xmm1 /* aligned chunk whose top 6 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 6(%rcx), %xmm2 /* following aligned chunk */
+L(Shl10Start): /* re-entry point used by L(Shl10LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit) /* match found in the new chunk */
+
+ palignr $10, %xmm1, %xmm2 /* %xmm2 = top 6 bytes of %xmm1 followed by low 10 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 22(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -6(%rcx), %rcx /* restore the 10-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -10(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl10LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 6(%rcx), %xmm2
+ movaps 22(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $10, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $10, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl10Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave10) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit): /* match in the chunk at 6(%rcx): write the last 6 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $6, %xmm6 /* drop its low 6 bytes */
+ mov $6, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $10, %xmm1, %xmm6 /* low 6 bytes = top 6 bytes of %xmm1, upper 10 bytes = destination bytes 6..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11): /* Copy path for a source pointer 11 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $11. */
+ movaps -11(%rcx), %xmm1 /* aligned chunk whose top 5 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 5(%rcx), %xmm2 /* following aligned chunk */
+L(Shl11Start): /* re-entry point used by L(Shl11LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit) /* match found in the new chunk */
+
+ palignr $11, %xmm1, %xmm2 /* %xmm2 = top 5 bytes of %xmm1 followed by low 11 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 21(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -5(%rcx), %rcx /* restore the 11-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -11(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl11LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 5(%rcx), %xmm2
+ movaps 21(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $11, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $11, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl11Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave11) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit): /* match in the chunk at 5(%rcx): write the last 5 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $5, %xmm6 /* drop its low 5 bytes */
+ mov $5, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $11, %xmm1, %xmm6 /* low 5 bytes = top 5 bytes of %xmm1, upper 11 bytes = destination bytes 5..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12): /* Copy path for a source pointer 12 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $12. */
+ movaps -12(%rcx), %xmm1 /* aligned chunk whose top 4 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 4(%rcx), %xmm2 /* following aligned chunk */
+L(Shl12Start): /* re-entry point used by L(Shl12LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit) /* match found in the new chunk */
+
+ palignr $12, %xmm1, %xmm2 /* %xmm2 = top 4 bytes of %xmm1 followed by low 12 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 20(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -4(%rcx), %rcx /* restore the 12-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -12(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl12LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 4(%rcx), %xmm2
+ movaps 20(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $12, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $12, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl12Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave12) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit): /* match in the chunk at 4(%rcx): write the last 4 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $4, %xmm6 /* drop its low 4 bytes */
+ mov $4, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $12, %xmm1, %xmm6 /* low 4 bytes = top 4 bytes of %xmm1, upper 12 bytes = destination bytes 4..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13): /* Copy path for a source pointer 13 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $13. */
+ movaps -13(%rcx), %xmm1 /* aligned chunk whose top 3 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 3(%rcx), %xmm2 /* following aligned chunk */
+L(Shl13Start): /* re-entry point used by L(Shl13LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit) /* match found in the new chunk */
+
+ palignr $13, %xmm1, %xmm2 /* %xmm2 = top 3 bytes of %xmm1 followed by low 13 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 19(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 19(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -3(%rcx), %rcx /* restore the 13-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -13(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl13LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 3(%rcx), %xmm2
+ movaps 19(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $13, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $13, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl13Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave13) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit): /* match in the chunk at 3(%rcx): write the last 3 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $3, %xmm6 /* drop its low 3 bytes */
+ mov $3, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $13, %xmm1, %xmm6 /* low 3 bytes = top 3 bytes of %xmm1, upper 13 bytes = destination bytes 3..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14): /* Copy path for a source pointer 14 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $14. */
+ movaps -14(%rcx), %xmm1 /* aligned chunk whose top 2 bytes are the source bytes at %rcx (movaps faults on unaligned addresses) */
+ movaps 2(%rcx), %xmm2 /* following aligned chunk */
+L(Shl14Start): /* re-entry point used by L(Shl14LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit) /* match found in the new chunk */
+
+ palignr $14, %xmm1, %xmm2 /* %xmm2 = top 2 bytes of %xmm1 followed by low 14 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 18(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 18(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -2(%rcx), %rcx /* restore the 14-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -14(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl14LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 2(%rcx), %xmm2
+ movaps 18(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $14, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $14, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl14Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave14) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit): /* match in the chunk at 2(%rcx): write the last 2 bytes of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $2, %xmm6 /* drop its low 2 bytes */
+ mov $2, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $14, %xmm1, %xmm6 /* low 2 bytes = top 2 bytes of %xmm1, upper 14 bytes = destination bytes 2..15 as they were */
+ movaps %xmm6, (%rdx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15): /* Copy path for a source pointer 15 bytes past a 16-byte boundary: aligned loads are stitched together with palignr $15. */
+ movaps -15(%rcx), %xmm1 /* aligned chunk whose top byte is the source byte at %rcx (movaps faults on unaligned addresses) */
+ movaps 1(%rcx), %xmm2 /* following aligned chunk */
+L(Shl15Start): /* re-entry point used by L(Shl15LoopStart) when its 64-byte scan finds a zero byte */
+ pcmpeqb %xmm2, %xmm0 /* compare against %xmm0, presumably all-zero here (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax /* %rax bit i set <=> byte i of the new chunk matched */
+ movaps %xmm2, %xmm3 /* save the unshifted chunk for the next palignr */
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3) /* taken when %r8 was <= 16 before the subtraction (unsigned) */
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit) /* match found in the new chunk */
+
+ palignr $15, %xmm1, %xmm2 /* %xmm2 = top byte of %xmm1 followed by low 15 bytes of %xmm2 */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx) /* aligned 16-byte store to the destination */
+ movaps 17(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* no byte matched above, so %xmm0 is all-zero: this detects zero bytes */
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2 /* second unrolled step, same scheme as the first */
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2 /* third unrolled step */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%rdx), %rdx
+ pmovmskb %xmm0, %rax
+ lea 16(%rcx), %rcx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %rax, %rax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2 /* fourth unrolled step; no further load here, the loop below reloads */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ lea 17(%rcx), %rcx /* %rcx now lands on a 16-byte boundary */
+ lea 16(%rdx), %rdx
+
+ mov %rcx, %rax
+ and $-0x40, %rcx /* round %rcx down to a 64-byte boundary */
+ sub %rcx, %rax /* %rax = number of bytes stepped back (0..63) */
+ lea -1(%rcx), %rcx /* restore the 15-byte skew expected by the load offsets below */
+ sub %rax, %rdx /* step the destination back by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %rax, %r8 /* and give those bytes back to the count */
+# endif
+ movaps -15(%rcx), %xmm1 /* chunk preceding the first loop chunk */
+
+L(Shl15LoopStart): /* main loop: 64 source bytes per iteration */
+ movaps 1(%rcx), %xmm2
+ movaps 17(%rcx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%rcx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%rcx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of all four chunks */
+ pcmpeqb %xmm0, %xmm7 /* %xmm0 is all-zero on this path: flags a zero byte anywhere in the 64 */
+ pmovmskb %xmm7, %rax
+ movaps %xmm5, %xmm7 /* keep the unshifted last chunk; it becomes %xmm1 for the next iteration */
+ palignr $15, %xmm4, %xmm5
+ test %rax, %rax
+ palignr $15, %xmm3, %xmm4 /* SSE ops do not modify EFLAGS, so jnz still sees the test result */
+ jnz L(Shl15Start) /* zero byte in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 untouched) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(StrncpyLeave15) /* %r8 was <= 64: the count ends within these 64 bytes */
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%rcx), %rcx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ movaps %xmm4, 32(%rdx)
+ movaps %xmm3, 16(%rdx)
+ movaps %xmm2, (%rdx)
+ lea 64(%rdx), %rdx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit): /* match in the chunk at 1(%rcx): write the last byte of %xmm1, then let the tail code finish */
+ movaps (%rdx), %xmm6 /* current destination block */
+ psrldq $1, %xmm6 /* drop its low byte */
+ mov $1, %rsi /* L(CopyFrom1To16Bytes) advances %rdx and %rcx by %rsi */
+ palignr $15, %xmm1, %xmm6 /* low byte = top byte of %xmm1, upper 15 bytes = destination bytes 1..15 as they were */
+ movaps %xmm6, (%rdx)
+# ifdef USE_AS_STRCAT
+ jmp L(CopyFrom1To16Bytes) /* without USE_AS_STRCAT execution falls through the alignment padding into L(CopyFrom1To16Bytes) */
+# endif
+
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* Dispatch on the 16-bit match mask in %ax: the lowest set bit k selects L(Exit<k+1>), which copies k+1 bytes. */
+# ifdef USE_AS_STRNCPY
+ add $16, %r8 /* presumably undoes the 16 subtracted before jumping here - confirm for all callers */
+# endif
+ add %rsi, %rdx /* apply the caller-supplied offset in %rsi to both pointers */
+ add %rsi, %rcx
+
+ test %al, %al
+ jz L(ExitHigh) /* nothing in mask bits 0..7: examine bits 8..15 in %ah */
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7) /* otherwise only bit 7 can be set: fall through into L(Exit8) below */
+
+ .p2align 4
+L(Exit8): /* Copy exactly 8 bytes from (%rcx) to (%rdx) and finish; also the fall-through target of L(CopyFrom1To16Bytes). */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %r8 /* %r8 = part of the count not yet written */
+ lea 8(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(ExitHigh): /* Mask bits 0..7 were clear: dispatch on bits 8..15 (%ah); the lowest set bit k selects L(Exit<k+9>). */
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15) /* otherwise fall through into L(Exit16) below */
+
+ .p2align 4
+L(Exit16): /* Copy exactly 16 bytes from (%rcx) to (%rdx) as two 8-byte moves and finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %rax
+ mov %rax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 15(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %r8 /* %r8 = part of the count not yet written */
+ lea 16(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* Tail dispatch when the match mask in %ax is nonzero and the count in %r8 may end first: whichever comes first picks the L(ExitN). */
+ add $16, %r8 /* callers visible here arrive with %r8 already reduced by 16 */
+ add %rsi, %rcx
+ lea (%rsi, %rdx), %rsi /* %rsi = advanced destination, parked while %rdx is used as scratch */
+ lea -9(%r8), %rdx
+ and $1<<7, %dh /* keep bit 15 of (%r8 - 9): set when %r8 - 9 is a small negative value, i.e. %r8 <= 8 */
+ or %al, %dh /* nonzero if %r8 <= 8 or any of mask bits 0..7 is set */
+ test %dh, %dh
+ lea (%rsi), %rdx /* restore the destination pointer; lea leaves the flags from test intact */
+ jz L(ExitHighCase2) /* both the count limit and the first match lie in bytes 9..16 */
+
+ cmp $1, %r8 /* pairs below: count ends after N bytes, or mask bit N-1 set => L(ExitN) */
+ je L(Exit1)
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ test $0x40, %al
+ jnz L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2): /* Continuation of L(CopyFrom1To16BytesCase2) for bytes 9..16: count ends after N bytes, or mask bit N-1 set => L(ExitN). */
+ cmp $9, %r8
+ je L(Exit9)
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $15, %r8
+ je L(Exit15)
+ test $0x40, %ah
+ jnz L(Exit15)
+ jmp L(Exit16) /* neither an earlier count limit nor an earlier match: copy all 16 */
+
+L(CopyFrom1To16BytesCase2OrCase3): /* Select the tail handler from the match mask in %rax. */
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* nonzero mask: Case2; a zero mask falls through into L(CopyFrom1To16BytesCase3) below */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3): /* Tail dispatch driven only by the count: jump to the L(ExitN) matching %r8 via a compare tree (the mask is not consulted). */
+ add $16, %r8 /* callers visible here arrive with %r8 already reduced by 16 */
+ add %rsi, %rdx /* apply the caller-supplied offset in %rsi to both pointers */
+ add %rsi, %rcx
+
+ cmp $16, %r8
+ je L(Exit16)
+ cmp $8, %r8
+ je L(Exit8)
+ jg L(More8Case3) /* signed compare; each cmp feeds the conditional jumps that follow it */
+ cmp $4, %r8
+ je L(Exit4)
+ jg L(More4Case3)
+ cmp $2, %r8
+ jl L(Exit1) /* every value below 2 ends up in L(Exit1) */
+ je L(Exit2)
+ jg L(Exit3)
+L(More8Case3): /* but less than 16 */
+ cmp $12, %r8
+ je L(Exit12)
+ jl L(Less12Case3)
+ cmp $14, %r8
+ jl L(Exit13)
+ je L(Exit14)
+ jg L(Exit15)
+L(More4Case3): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Exit5)
+ je L(Exit6)
+ jg L(Exit7)
+L(Less12Case3): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Exit9)
+ je L(Exit10)
+ jg L(Exit11)
+# endif
+
+ .p2align 4
+L(Exit1): /* Copy exactly 1 byte from (%rcx) to (%rdx) and finish. */
+ movb (%rcx), %al
+ movb %al, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea (%rdx), %rax /* return candidate: address of the byte just written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %r8 /* %r8 = part of the count not yet written */
+ lea 1(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit2): /* Copy exactly 2 bytes from (%rcx) to (%rdx) and finish. */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 1(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %r8 /* %r8 = part of the count not yet written */
+ lea 2(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit3): /* Copy exactly 3 bytes from (%rcx) to (%rdx) (2-byte move plus 1-byte move) and finish. */
+ movw (%rcx), %ax
+ movw %ax, (%rdx)
+ movb 2(%rcx), %al
+ movb %al, 2(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 2(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %r8 /* %r8 = part of the count not yet written */
+ lea 3(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit4): /* Copy exactly 4 bytes from (%rcx) to (%rdx) and finish. */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 3(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %r8 /* %r8 = part of the count not yet written */
+ lea 4(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit5): /* Copy exactly 5 bytes from (%rcx) to (%rdx) (4-byte move plus 1-byte move) and finish. */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movb 4(%rcx), %al
+ movb %al, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 4(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %r8 /* %r8 = part of the count not yet written */
+ lea 5(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit6): /* Copy exactly 6 bytes from (%rcx) to (%rdx) (4-byte move plus 2-byte move) and finish. */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movw 4(%rcx), %ax
+ movw %ax, 4(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 5(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %r8 /* %r8 = part of the count not yet written */
+ lea 6(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit7): /* Copy exactly 7 bytes from (%rcx) to (%rdx) using two overlapping 4-byte moves (offsets 0 and 3) and finish. */
+ movl (%rcx), %eax
+ movl %eax, (%rdx)
+ movl 3(%rcx), %eax /* bytes 3..6; byte 3 is copied twice */
+ movl %eax, 3(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 6(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %r8 /* %r8 = part of the count not yet written */
+ lea 7(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit9): /* Copy exactly 9 bytes from (%rcx) to (%rdx) using an 8-byte move and an overlapping 4-byte move at offset 5, then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 5(%rcx), %eax /* bytes 5..8; bytes 5..7 are copied twice */
+ mov %eax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 8(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %r8 /* %r8 = part of the count not yet written */
+ lea 9(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit10): /* Copy exactly 10 bytes from (%rcx) to (%rdx) using an 8-byte move and an overlapping 4-byte move at offset 6, then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 6(%rcx), %eax /* bytes 6..9; bytes 6..7 are copied twice */
+ mov %eax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 9(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %r8 /* %r8 = part of the count not yet written */
+ lea 10(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit11): /* Copy exactly 11 bytes from (%rcx) to (%rdx) using an 8-byte move and an overlapping 4-byte move at offset 7, then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 7(%rcx), %eax /* bytes 7..10; byte 7 is copied twice */
+ mov %eax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 10(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %r8 /* %r8 = part of the count not yet written */
+ lea 11(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit12): /* Copy exactly 12 bytes from (%rcx) to (%rdx) (8-byte move plus 4-byte move at offset 8) and finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 8(%rcx), %eax
+ mov %eax, 8(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 11(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %r8 /* %r8 = part of the count not yet written */
+ lea 12(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit13): /* Copy exactly 13 bytes from (%rcx) to (%rdx) using two overlapping 8-byte moves (offsets 0 and 5), then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 5(%rcx), %rax /* bytes 5..12; bytes 5..7 are copied twice */
+ mov %rax, 5(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 12(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %r8 /* %r8 = part of the count not yet written */
+ lea 13(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit14): /* Copy exactly 14 bytes from (%rcx) to (%rdx) using two overlapping 8-byte moves (offsets 0 and 6), then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 6(%rcx), %rax /* bytes 6..13; bytes 6..7 are copied twice */
+ mov %rax, 6(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 13(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %r8 /* %r8 = part of the count not yet written */
+ lea 14(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+ .p2align 4
+L(Exit15): /* Copy exactly 15 bytes from (%rcx) to (%rdx) using two overlapping 8-byte moves (offsets 0 and 7), then finish. */
+ mov (%rcx), %rax
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax /* bytes 7..14; byte 7 is copied twice */
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* return candidate: address of the last byte written */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %r8 /* %r8 = part of the count not yet written */
+ lea 15(%rdx), %rcx /* first address past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted: zero-fill from %rcx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%rax) /* CF = 1 iff the last byte written is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# endif
+# endif
+ ret
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0): /* L(FillN) family: store N bytes taken from %rdx at (%rcx) and return. %rdx is zeroed in L(StrncpyFillTailWithZero1) before L(FillFrom1To16Bytes) dispatches here. */
+ ret
+
+ .p2align 4
+L(Fill1): /* 1 byte */
+ movb %dl, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill2): /* 2 bytes */
+ movw %dx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill3): /* 3 bytes: 2 + 1 */
+ movw %dx, (%rcx)
+ movb %dl, 2(%rcx)
+ ret
+
+ .p2align 4
+L(Fill4): /* 4 bytes */
+ movl %edx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill5): /* 5 bytes: 4 + 1 */
+ movl %edx, (%rcx)
+ movb %dl, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill6): /* 6 bytes: 4 + 2 */
+ movl %edx, (%rcx)
+ movw %dx, 4(%rcx)
+ ret
+
+ .p2align 4
+L(Fill7): /* 7 bytes: two overlapping 4-byte stores at offsets 0 and 3 */
+ movl %edx, (%rcx)
+ movl %edx, 3(%rcx)
+ ret
+
+ .p2align 4
+L(Fill8): /* 8 bytes */
+ mov %rdx, (%rcx)
+ ret
+
+ .p2align 4
+L(Fill9): /* 9 bytes: 8 + 1 */
+ mov %rdx, (%rcx)
+ movb %dl, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill10): /* 10 bytes: 8 + 2 */
+ mov %rdx, (%rcx)
+ movw %dx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill11): /* 11 bytes: 8-byte store plus overlapping 4-byte store at offset 7 */
+ mov %rdx, (%rcx)
+ movl %edx, 7(%rcx)
+ ret
+
+ .p2align 4
+L(Fill12): /* 12 bytes: 8 + 4 */
+ mov %rdx, (%rcx)
+ movl %edx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(Fill13): /* 13 bytes: two overlapping 8-byte stores at offsets 0 and 5 */
+ mov %rdx, (%rcx)
+ mov %rdx, 5(%rcx)
+ ret
+
+ .p2align 4
+L(Fill14): /* 14 bytes: two overlapping 8-byte stores at offsets 0 and 6 */
+ mov %rdx, (%rcx)
+ mov %rdx, 6(%rcx)
+ ret
+
+ .p2align 4
+L(Fill15): /* 15 bytes: two overlapping 8-byte stores at offsets 0 and 7 */
+ mov %rdx, (%rcx)
+ mov %rdx, 7(%rcx)
+ ret
+
+ .p2align 4
+L(Fill16): /* 16 bytes: 8 + 8 */
+ mov %rdx, (%rcx)
+ mov %rdx, 8(%rcx)
+ ret
+
+ .p2align 4
+L(StrncpyFillExit1): /* Entered after a "sub $16"/"add $16" on %r8 went non-positive: add 16 back to get the remaining fill length, then dispatch. */
+ lea 16(%r8), %r8
+L(FillFrom1To16Bytes): /* Jump to the L(FillN) matching %r8 (0..16) via a compare tree. */
+ test %r8, %r8
+ jz L(Fill0)
+ cmp $16, %r8
+ je L(Fill16)
+ cmp $8, %r8
+ je L(Fill8)
+ jg L(FillMore8) /* signed compare; each cmp feeds the conditional jumps that follow it */
+ cmp $4, %r8
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %r8
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %r8
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %r8
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %r8
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %r8
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1): /* Write %r8 zero bytes starting at %rcx (reached from the L(ExitN) blocks when the count is not exhausted). */
+ xor %rdx, %rdx /* %rdx = 0: the value stored by the L(FillN) blocks */
+ sub $16, %r8
+ jbe L(StrncpyFillExit1) /* 16 bytes or fewer: handled by the L(FillN) dispatch */
+
+ pxor %xmm0, %xmm0 /* 16 zero bytes for the aligned stores below */
+ mov %rdx, (%rcx) /* first 16 zero bytes, written as two 8-byte stores (no alignment needed) */
+ mov %rdx, 8(%rcx)
+
+ lea 16(%rcx), %rcx
+
+ mov %rcx, %rdx
+ and $0xf, %rdx /* %rdx = %rcx mod 16 */
+ sub %rdx, %rcx /* round %rcx down to a 16-byte boundary for movdqa */
+ add %rdx, %r8 /* the bytes stepped back over are zeroed again, so count them */
+ xor %rdx, %rdx /* re-zero %rdx for the L(FillN) blocks */
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* 64 zero bytes per iteration */
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ movdqa %xmm0, 32(%rcx)
+ movdqa %xmm0, 48(%rcx)
+ lea 64(%rcx), %rcx
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa) /* lea does not touch flags: loop while the subtraction did not borrow */
+
+L(StrncpyFillLess64): /* %r8 is negative here: (bytes left) - 64 */
+ add $32, %r8
+ jl L(StrncpyFillLess32) /* fewer than 32 bytes left */
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm0, 16(%rcx)
+ lea 32(%rcx), %rcx
+ sub $16, %r8
+ jl L(StrncpyFillExit1) /* fewer than 16 left; L(StrncpyFillExit1) adds the 16 back */
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32): /* %r8 = (bytes left) - 32 */
+ add $16, %r8
+ jl L(StrncpyFillExit1) /* fewer than 16 left; L(StrncpyFillExit1) adds the 16 back */
+ movdqa %xmm0, (%rcx)
+ lea 16(%rcx), %rcx
+ jmp L(FillFrom1To16Bytes)
+
+ .p2align 4
+L(Exit0): /* Return without copying anything: the result is whatever %rdx holds (presumably the destination pointer - confirm at the jump site). */
+ mov %rdx, %rax
+ ret
+
+ .p2align 4
+L(StrncpyExit15Bytes): /* Byte-wise tail for bytes 9..15: leave via L(ExitN) when the count %r8 equals N or source byte N-1 is zero; otherwise copy 15 bytes here. */
+ cmp $9, %r8
+ je L(Exit9)
+ cmpb $0, 8(%rcx)
+ jz L(Exit9)
+ cmp $10, %r8
+ je L(Exit10)
+ cmpb $0, 9(%rcx)
+ jz L(Exit10)
+ cmp $11, %r8
+ je L(Exit11)
+ cmpb $0, 10(%rcx)
+ jz L(Exit11)
+ cmp $12, %r8
+ je L(Exit12)
+ cmpb $0, 11(%rcx)
+ jz L(Exit12)
+ cmp $13, %r8
+ je L(Exit13)
+ cmpb $0, 12(%rcx)
+ jz L(Exit13)
+ cmp $14, %r8
+ je L(Exit14)
+ cmpb $0, 13(%rcx)
+ jz L(Exit14)
+ mov (%rcx), %rax /* no earlier exit: copy 15 bytes with two overlapping 8-byte moves */
+ mov %rax, (%rdx)
+ mov 7(%rcx), %rax
+ mov %rax, 7(%rdx)
+# ifdef USE_AS_STPCPY
+ lea 14(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+ ret /* unlike L(Exit15), this path neither updates %r8 nor zero-fills */
+
+ .p2align 4
+L(StrncpyExit8Bytes): /* Byte-wise tail for bytes 1..8: leave via L(ExitN) when the count %r8 equals N or source byte N-1 is zero; otherwise copy 8 bytes here. */
+ cmp $1, %r8
+ je L(Exit1)
+ cmpb $0, (%rcx)
+ jz L(Exit1)
+ cmp $2, %r8
+ je L(Exit2)
+ cmpb $0, 1(%rcx)
+ jz L(Exit2)
+ cmp $3, %r8
+ je L(Exit3)
+ cmpb $0, 2(%rcx)
+ jz L(Exit3)
+ cmp $4, %r8
+ je L(Exit4)
+ cmpb $0, 3(%rcx)
+ jz L(Exit4)
+ cmp $5, %r8
+ je L(Exit5)
+ cmpb $0, 4(%rcx)
+ jz L(Exit5)
+ cmp $6, %r8
+ je L(Exit6)
+ cmpb $0, 5(%rcx)
+ jz L(Exit6)
+ cmp $7, %r8
+ je L(Exit7)
+ cmpb $0, 6(%rcx)
+ jz L(Exit7)
+ mov (%rcx), %rax /* no earlier exit: copy 8 bytes */
+ mov %rax, (%rdx)
+# ifdef USE_AS_STPCPY
+ lea 7(%rdx), %rax /* address of the last byte written */
+ cmpb $1, (%rax) /* CF = 1 iff that byte is zero */
+ sbb $-1, %rax /* %rax += 1 - CF: step past a nonzero last byte */
+# else
+ mov %rdi, %rax /* return %rdi (first argument register in the SysV x86-64 ABI) */
+# endif
+ ret /* unlike L(Exit8), this path neither updates %r8 nor zero-fills */
+
+# endif
+
+# ifdef USE_AS_STRNCPY
+
+L(StrncpyLeaveCase2OrCase3): /* Leave a 64-byte loop because the count ends inside the current 64 bytes; a nonzero mask in %rax selects Case2. */
+ test %rax, %rax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3): /* mask was zero: store whole 16-byte chunks while more than 16 bytes of the count remain, then finish by count only */
+ lea 64(%r8), %r8 /* add 64 back to the count (presumably subtracted by the loop before jumping here - confirm) */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3) /* 16 or fewer bytes left: L(CopyFrom1To16BytesCase3) adds the 16 back */
+ movaps %xmm4, -64(%rdx) /* %xmm4..%xmm6 presumably hold the loop's first three chunks - confirm at the loop */
+ lea 16(%rsi), %rsi /* %rsi is the offset the tail code adds to %rdx and %rcx */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* unconditional: keeps %r8 biased by -16 as L(CopyFrom1To16BytesCase3) expects */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2): /* Count ends inside the current 64 bytes and the combined mask was nonzero: test the chunks one by one, storing each chunk that has no match. */
+ pcmpeqb %xmm4, %xmm0 /* %xmm0 presumably all-zero on entry (set up outside this excerpt - confirm) */
+ pmovmskb %xmm0, %rax
+ add $48, %r8 /* net effect of adding 64 back and subtracting 16 for this chunk */
+ jle L(CopyFrom1To16BytesCase2OrCase3) /* signed test: count ends within this chunk */
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes) /* match in this chunk, count extends beyond it */
+
+ pcmpeqb %xmm5, %xmm0 /* mask above was zero, so %xmm0 is all-zero again */
+ pmovmskb %xmm0, %rax
+ movaps %xmm4, -64(%rdx) /* previous chunk had no match: store it whole */
+ lea 16(%rsi), %rsi /* %rsi is the offset the tail code adds to %rdx and %rcx */
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rax
+ movaps %xmm5, -48(%rdx)
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rax, %rax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0 /* last chunk: the match must be here */
+ pmovmskb %xmm0, %rax
+ movaps %xmm6, -32(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8 /* keep %r8 biased by -16 as L(CopyFrom1To16BytesCase2) expects */
+ jmp L(CopyFrom1To16BytesCase2)
+/*--------------------------------------------------*/
+L(StrncpyExit1Case2OrCase3): /* Replace the low 15 bytes at (%rdx) with bytes 1..15 of %xmm1, keeping the top byte, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $15, %xmm6
+ mov $15, %rsi /* %rsi = 15, the count of bytes merged from %xmm1. */
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit2Case2OrCase3): /* Replace the low 14 bytes at (%rdx) with bytes 2..15 of %xmm1, keeping the top 2 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %rsi /* %rsi = 14, the count of bytes merged from %xmm1. */
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit3Case2OrCase3): /* Replace the low 13 bytes at (%rdx) with bytes 3..15 of %xmm1, keeping the top 3 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %rsi /* %rsi = 13, the count of bytes merged from %xmm1. */
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit4Case2OrCase3): /* Replace the low 12 bytes at (%rdx) with bytes 4..15 of %xmm1, keeping the top 4 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %rsi /* %rsi = 12, the count of bytes merged from %xmm1. */
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit5Case2OrCase3): /* Replace the low 11 bytes at (%rdx) with bytes 5..15 of %xmm1, keeping the top 5 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $11, %xmm6
+ mov $11, %rsi /* %rsi = 11, the count of bytes merged from %xmm1. */
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit6Case2OrCase3): /* Replace the low 10 bytes at (%rdx) with bytes 6..15 of %xmm1, keeping the top 6 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $10, %xmm6
+ mov $10, %rsi /* %rsi = 10, the count of bytes merged from %xmm1. */
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit7Case2OrCase3): /* Replace the low 9 bytes at (%rdx) with bytes 7..15 of %xmm1, keeping the top 7 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $9, %xmm6
+ mov $9, %rsi /* %rsi = 9, the count of bytes merged from %xmm1. */
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit8Case2OrCase3): /* Replace the low 8 bytes at (%rdx) with bytes 8..15 of %xmm1, keeping the top 8 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %rsi /* %rsi = 8, the count of bytes merged from %xmm1. */
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit9Case2OrCase3): /* Replace the low 7 bytes at (%rdx) with bytes 9..15 of %xmm1, keeping the top 9 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $7, %xmm6
+ mov $7, %rsi /* %rsi = 7, the count of bytes merged from %xmm1. */
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit10Case2OrCase3): /* Replace the low 6 bytes at (%rdx) with bytes 10..15 of %xmm1, keeping the top 10 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $6, %xmm6
+ mov $6, %rsi /* %rsi = 6, the count of bytes merged from %xmm1. */
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit11Case2OrCase3): /* Replace the low 5 bytes at (%rdx) with bytes 11..15 of %xmm1, keeping the top 11 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $5, %xmm6
+ mov $5, %rsi /* %rsi = 5, the count of bytes merged from %xmm1. */
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit12Case2OrCase3): /* Replace the low 4 bytes at (%rdx) with bytes 12..15 of %xmm1, keeping the top 12 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %rsi /* %rsi = 4, the count of bytes merged from %xmm1. */
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit13Case2OrCase3): /* Replace the low 3 bytes at (%rdx) with bytes 13..15 of %xmm1, keeping the top 13 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $3, %xmm6
+ mov $3, %rsi /* %rsi = 3, the count of bytes merged from %xmm1. */
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit14Case2OrCase3): /* Replace the low 2 bytes at (%rdx) with bytes 14..15 of %xmm1, keeping the top 14 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $2, %xmm6
+ mov $2, %rsi /* %rsi = 2, the count of bytes merged from %xmm1. */
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit15Case2OrCase3): /* Replace the low byte at (%rdx) with byte 15 of %xmm1, keeping the top 15 bytes, then dispatch on %rax. */
+ movaps (%rdx), %xmm6
+ psrldq $1, %xmm6
+ mov $1, %rsi /* %rsi = 1, the count of bytes merged from %xmm1. */
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%rdx)
+ test %rax, %rax
+ jnz L(CopyFrom1To16BytesCase2) /* %rax != 0 selects Case2, otherwise Case3. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit1). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit1) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $1, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 1 byte. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 31(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+31 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 31+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit1)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit1): /* Replace the low 15 bytes of the 16-byte block at (%rdx,%rsi) with bytes 1..15 of %xmm1; the top byte of the block is kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $15, %xmm6
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 15(%rsi), %rsi /* %rsi += 15, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit2). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit2) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $2, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 2 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 30(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+30 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 30+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit2)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit2): /* Replace the low 14 bytes of the 16-byte block at (%rdx,%rsi) with bytes 2..15 of %xmm1; the top 2 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $14, %xmm6
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 14(%rsi), %rsi /* %rsi += 14, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit3). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit3) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $3, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 3 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 29(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+29 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 29+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit3)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit3): /* Replace the low 13 bytes of the 16-byte block at (%rdx,%rsi) with bytes 3..15 of %xmm1; the top 3 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $13, %xmm6
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 13(%rsi), %rsi /* %rsi += 13, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit4). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit4) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $4, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 4 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 28(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+28 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 28+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit4)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit4): /* Replace the low 12 bytes of the 16-byte block at (%rdx,%rsi) with bytes 4..15 of %xmm1; the top 4 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $12, %xmm6
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 12(%rsi), %rsi /* %rsi += 12, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit5). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit5) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $5, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 5 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 27(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+27 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 27+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit5)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit5): /* Replace the low 11 bytes of the 16-byte block at (%rdx,%rsi) with bytes 5..15 of %xmm1; the top 5 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $11, %xmm6
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 11(%rsi), %rsi /* %rsi += 11, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit6). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit6) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $6, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 6 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 26(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+26 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 26+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit6)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit6): /* Replace the low 10 bytes of the 16-byte block at (%rdx,%rsi) with bytes 6..15 of %xmm1; the top 6 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $10, %xmm6
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 10(%rsi), %rsi /* %rsi += 10, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit7). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit7) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $7, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 7 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 25(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+25 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 25+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit7)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit7): /* Replace the low 9 bytes of the 16-byte block at (%rdx,%rsi) with bytes 7..15 of %xmm1; the top 7 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $9, %xmm6
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 9(%rsi), %rsi /* %rsi += 9, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit8). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit8) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $8, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 8 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 24(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+24 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 24+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit8)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit8): /* Replace the low 8 bytes of the 16-byte block at (%rdx,%rsi) with bytes 8..15 of %xmm1; the top 8 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $8, %xmm6
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 8(%rsi), %rsi /* %rsi += 8, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit9). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit9) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $9, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 9 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 23(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+23 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 23+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit9)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit9): /* Replace the low 7 bytes of the 16-byte block at (%rdx,%rsi) with bytes 9..15 of %xmm1; the top 9 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $7, %xmm6
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 7(%rsi), %rsi /* %rsi += 7, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit10). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit10) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $10, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 10 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 22(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+22 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 22+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit10)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit10)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit10): /* Replace the low 6 bytes of the 16-byte block at (%rdx,%rsi) with bytes 10..15 of %xmm1; the top 10 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $6, %xmm6
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 6(%rsi), %rsi /* %rsi += 6, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit11). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit11) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $11, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 11 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 21(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+21 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 21+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit11)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit11)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit11): /* Replace the low 5 bytes of the 16-byte block at (%rdx,%rsi) with bytes 11..15 of %xmm1; the top 11 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $5, %xmm6
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 5(%rsi), %rsi /* %rsi += 5, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit12). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit12) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $12, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 12 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 20(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+20 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 20+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit12)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit12)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit12): /* Replace the low 4 bytes of the 16-byte block at (%rdx,%rsi) with bytes 12..15 of %xmm1; the top 12 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $4, %xmm6
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 4(%rsi), %rsi /* %rsi += 4, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit13). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit13) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $13, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 13 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 19(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+19 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 19+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit13)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit13)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit13): /* Replace the low 3 bytes of the 16-byte block at (%rdx,%rsi) with bytes 13..15 of %xmm1; the top 13 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $3, %xmm6
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 3(%rsi), %rsi /* %rsi += 3, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit14). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit14) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $14, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 14 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 18(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+18 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 18+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit14)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit14)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit14): /* Replace the low 2 bytes of the 16-byte block at (%rdx,%rsi) with bytes 14..15 of %xmm1; the top 14 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $2, %xmm6
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 2(%rsi), %rsi /* %rsi += 2, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15): /* Store up to four more 16-byte chunks at 0/16/32/48(%rdx) while %r8 (presumably the remaining count) allows, then fall into L(StrncpyExit15). */
+ movaps %xmm2, %xmm3 /* Keep a copy: palignr below overwrites %xmm2. */
+ add $48, %r8
+ jle L(StrncpyExit15) /* NOTE(review): signed test after the add; the later checks use unsigned jbe. */
+ palignr $15, %xmm1, %xmm2 /* %xmm2 = (%xmm2:%xmm1) >> 15 bytes. */
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%rdx)
+ movaps 17(%rcx), %xmm2 /* movaps faults on unaligned memory, so %rcx+17 is presumably 16-byte aligned -- TODO confirm. */
+ lea 16(%rsi), %rsi
+ movaps %xmm2, %xmm3
+ sub $16, %r8
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, 16(%rdx)
+ movaps 17+16(%rcx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit15)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%rdx) /* %xmm4 and %xmm5 are stored unmodified here. */
+ lea 16(%rsi), %rsi
+ sub $16, %r8
+ jbe L(StrncpyExit15)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%rdx)
+ lea 16(%rsi), %rsi
+ lea -16(%r8), %r8
+
+L(StrncpyExit15): /* Replace the low byte of the 16-byte block at (%rdx,%rsi) with byte 15 of %xmm1; the top 15 bytes of the block are kept. */
+ movaps (%rdx, %rsi), %xmm6
+ psrldq $1, %xmm6
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%rdx, %rsi)
+ lea 1(%rsi), %rsi /* %rsi += 1, the number of bytes just merged. */
+ jmp L(CopyFrom1To16BytesCase3)
+# endif
+
+END (STRCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 02fa8d0..381060f 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -1,5 +1,5 @@
-/* strcpy with SSSE3
- Copyright (C) 2009 Free Software Foundation, Inc.
+/* Multiple versions of strcpy
+ Copyright (C) 2009, 2011 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
@@ -29,30 +29,32 @@
#ifdef USE_AS_STPCPY
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __stpncpy_ssse3
-# define STRCPY_SSE2 __stpncpy_sse2
-# define __GI_STRCPY __GI_stpncpy
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned /* stpncpy variant returned when bit_Fast_Unaligned_Load is set. */
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
# else
-# define STRCPY_SSSE3 __stpcpy_ssse3
-# define STRCPY_SSE2 __stpcpy_sse2
-# define __GI_STRCPY __GI_stpcpy
-# define __GI___STRCPY __GI___stpcpy
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned /* stpcpy variant returned when bit_Fast_Unaligned_Load is set. */
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
# endif
#else
# ifdef USE_AS_STRNCPY
-# define STRCPY_SSSE3 __strncpy_ssse3
-# define STRCPY_SSE2 __strncpy_sse2
-# define __GI_STRCPY __GI_strncpy
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned /* strncpy variant returned when bit_Fast_Unaligned_Load is set. */
+# define __GI_STRCPY __GI_strncpy
# else
-# define STRCPY_SSSE3 __strcpy_ssse3
-# define STRCPY_SSE2 __strcpy_sse2
-# define __GI_STRCPY __GI_strcpy
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned /* strcpy variant returned when bit_Fast_Unaligned_Load is set. */
+# define __GI_STRCPY __GI_strcpy
# endif
#endif
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
@@ -62,1830 +64,16 @@ ENTRY(STRCPY)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq STRCPY_SSE2(%rip), %rax
+1: leaq STRCPY_SSE2_UNALIGNED(%rip), %rax /* Candidate 1: the unaligned SSE2 variant. */
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 2f /* Fast_Unaligned_Load bit set: return candidate 1 without looking at SSSE3. */
+ leaq STRCPY_SSE2(%rip), %rax /* Otherwise SSE2, replaced by SSSE3 below when bit_SSSE3 is set. */
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jz 2f
leaq STRCPY_SSSE3(%rip), %rax
2: ret
END(STRCPY)
- .section .text.ssse3,"ax",@progbits
-STRCPY_SSSE3:
- cfi_startproc
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to copy up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRNCPY
- test %rdx, %rdx
- jz LABEL(strncpy_exitz)
- mov %rdx, %r8
-#else
- xor %edx, %edx
-#endif
- mov %esi, %ecx
- and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/
- and $15, %ecx
- mov %rdi, %rax /*store return parameter*/
-
-
- pxor %xmm0, %xmm0 /* clear %xmm0 */
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
- shr %cl, %edx /* get real bits left in edx*/
- test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
- jnz LABEL(less16bytes)
-
-#ifdef USE_AS_STRNCPY
- lea -16(%r8,%rcx), %r11
- cmp $0, %r11
- jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
-#endif
-
- mov %rcx, %r9
- or %edi, %ecx
- and $15, %ecx
- lea -16(%r9), %r10
- jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/
-
- neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/
-
- pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/
- pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(less32bytes)
- /*
- * at least 16 byte available to fill destination rdi
- */
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(less32bytes_strncpy_truncation)
-#endif
- mov (%rsi, %r9), %rdx
- mov %rdx, (%rdi)
- mov 8(%rsi, %r9), %rdx
- mov %rdx, 8(%rdi)
-
- /*
- * so far destatination rdi may be aligned by 16, re-calculate rsi to jump
- * crossponding case
- * rcx is offset of rsi
- * rax is offset of rdi
- */
-
- and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */
- mov %rax, %rdx /* rax store orignal rdi */
- xor %rdi, %rdx /* equal to and $15, %rdx */
-#ifdef USE_AS_STRNCPY
- add %rdx, %r8
-#endif
-
- add $16, %rdi /* next 16 bytes for rdi */
- sub %rdx, %r9
-
- lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */
- mov %esi, %ecx /*store offset of rsi */
- and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */
-
- and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/
- jz LABEL(ashr_0)
-
- lea -16(%rcx), %r10
- mov %rcx, %r9
- neg %r10
- lea LABEL(unaligned_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
-
- /*
- * The following cases will be handled by ashr_0 & ashr_0_start
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * 0 0 0 ashr_0
- * n(1~15) n(1~15) 0 ashr_0_start
- *
- */
- .p2align 5
-LABEL(ashr_0):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
- movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
- add $16, %rsi
- add $16, %rdi
- pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
- pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
-
- test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
- jnz LABEL(aligned_16bytes)
-
-LABEL(ashr_0_loop):
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(aligned_exit)
-
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_aligned)
-#endif
- movdqa (%rsi, %rcx), %xmm1
- movdqa %xmm1, (%rdi, %rcx)
- add $16, %rcx
- pcmpeqb (%rsi, %rcx), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jz LABEL(ashr_0_loop)
-
- jmp LABEL(aligned_exit)
- .p2align 4
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_15):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_15_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $15, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_15_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_14):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_14_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $14, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_14_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_13):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_13_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $13, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_13_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_12):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_12_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $12, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_12_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_11):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_11_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $11, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_11_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_10):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_10_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $10, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_10_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_9):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_9_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $9, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_9_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_8):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_8_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $8, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_8_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_7):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- .p2align 4
-
-LABEL(ashr_7_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $7, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_7_use_ssse3)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_6):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_6_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $6, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_6_use_ssse3)
-
- /*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_5):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_5_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $5, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_5_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_4):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_4_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $4, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_4_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_3):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_3_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $3, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_3_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_2):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_2_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $2, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_2_use_ssse3)
-
-/*
- *
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1
- *
- * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte
- */
- .p2align 4
-LABEL(ashr_1):
- xor %ecx, %ecx /*clear ecx */
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- .p2align 4
-LABEL(ashr_1_use_ssse3):
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
-
- palignr $1, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
-
- movdqa 16(%rsi, %rcx), %xmm3
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz LABEL(unaligned_exit)
-#ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe LABEL(strncpy_truncation_unaligned)
-#endif
- palignr $1, (%rsi, %rcx), %xmm3
- movdqa %xmm3, (%rdi, %rcx)
- add $16, %rcx
-
-#ifdef USE_AS_STRNCPY
- cmp %r10, %r8
- jbe LABEL(unaligned_exit)
-#endif
- jmp LABEL(ashr_1_use_ssse3)
-
- .p2align 4
-LABEL(less32bytes):
- xor %ecx, %ecx
-LABEL(unaligned_exit):
- add %r9, %rsi /* r9 stores original offset of rsi*/
- mov %rcx, %r9
- mov %r10, %rcx
- shl %cl, %edx /* after shl, calculate the exact number to be filled*/
- mov %r9, %rcx
- .p2align 4
-LABEL(aligned_exit):
- add %rcx, %rdi /*locate exact address for rdi */
-LABEL(less16bytes):
- add %rcx, %rsi /*locate exact address for rsi */
-LABEL(aligned_16bytes):
-#ifdef USE_AS_STRNCPY
- mov $1, %r9d
- lea -1(%r8), %rcx
- shl %cl, %r9d
- cmp $32, %r8
- ja LABEL(strncpy_tail)
- or %r9d, %edx
-LABEL(strncpy_tail):
-#endif
- bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(less32bytes_strncpy_truncation):
- xor %ecx, %ecx
-LABEL(strncpy_truncation_unaligned):
- add %r9, %rsi
-LABEL(strncpy_truncation_aligned):
- add %rcx, %rdi
- add %rcx, %rsi
- add $16, %r8
- lea -1(%r8), %rcx
- lea LABEL(tail_table)(%rip), %r11
- movslq (%r11, %rcx,4), %rcx
- lea (%r11, %rcx), %rcx
- jmp *%rcx
- .p2align 4
-LABEL(strncpy_exitz):
- mov %rdi, %rax
- ret
-#endif
-
-#ifdef USE_AS_STRNCPY
- .p2align 4
-LABEL(strncpy_fill_tail):
- mov %rax, %rdx
- movzx %cl, %rax
- mov %r8, %rcx
- add %rax, %rdi
- xor %eax, %eax
- shr $3, %ecx
- jz LABEL(strncpy_fill_less_8)
-
- rep stosq
-LABEL(strncpy_fill_less_8):
- mov %r8, %rcx
- and $7, %ecx
- jz LABEL(strncpy_fill_return)
-LABEL(strncpy_fill_less_7):
- sub $1, %ecx
- mov %al, (%rdi, %rcx)
- jnz LABEL(strncpy_fill_less_7)
-LABEL(strncpy_fill_return):
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rdx)
- sbb $-1, %rdx
-#endif
- mov %rdx, %rax
- ret
-#endif
- .p2align 4
-LABEL(tail_0):
- mov (%rsi), %cl
- mov %cl, (%rdi)
-#ifdef USE_AS_STPCPY
- mov %rdi, %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $1, %cl
- sub $1, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_1):
- mov (%rsi), %cx
- mov %cx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $2, %cl
- sub $2, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_2):
- mov (%rsi), %cx
- mov %cx, (%rdi)
- mov 1(%rsi), %cx
- mov %cx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $3, %cl
- sub $3, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_3):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $4, %cl
- sub $4, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_4):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 1(%rsi), %edx
- mov %edx, 1(%rdi)
-#ifdef USE_AS_STPCPY
- lea 4(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $5, %cl
- sub $5, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_5):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 2(%rsi), %edx
- mov %edx, 2(%rdi)
-#ifdef USE_AS_STPCPY
- lea 5(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $6, %cl
- sub $6, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_6):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov 3(%rsi), %edx
- mov %edx,3(%rdi)
-#ifdef USE_AS_STPCPY
- lea 6(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $7, %cl
- sub $7, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_7):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
-#ifdef USE_AS_STPCPY
- lea 7(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $8, %cl
- sub $8, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_8):
-
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %edx
- mov %edx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 8(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $9, %cl
- sub $9, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_9):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %edx
- mov %edx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 9(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $10, %cl
- sub $10, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_10):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %edx
- mov %edx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 10(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $11, %cl
- sub $11, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_11):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %edx
- mov %edx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 11(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $12, %cl
- sub $12, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_12):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 5(%rsi), %rcx
- mov %rcx, 5(%rdi)
-#ifdef USE_AS_STPCPY
- lea 12(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $13, %cl
- sub $13, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_13):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 6(%rsi), %rcx
- mov %rcx, 6(%rdi)
-#ifdef USE_AS_STPCPY
- lea 13(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $14, %cl
- sub $14, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_14):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 7(%rsi), %rcx
- mov %rcx, 7(%rdi)
-#ifdef USE_AS_STPCPY
- lea 14(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $15, %cl
- sub $15, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
-LABEL(tail_15):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
-#ifdef USE_AS_STPCPY
- lea 15(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $16, %cl
- sub $16, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_16):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cl
- mov %cl, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 16(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $17, %cl
- sub $17, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_17):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %cx
- mov %cx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 17(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $18, %cl
- sub $18, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_18):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %ecx
- mov %ecx,15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 18(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $19, %cl
- sub $19, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_19):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %ecx
- mov %ecx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 19(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $20, %cl
- sub $20, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_20):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 13(%rsi), %rcx
- mov %rcx, 13(%rdi)
-#ifdef USE_AS_STPCPY
- lea 20(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $21, %cl
- sub $21, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_21):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 14(%rsi), %rcx
- mov %rcx, 14(%rdi)
-#ifdef USE_AS_STPCPY
- lea 21(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $22, %cl
- sub $22, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_22):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 15(%rsi), %rcx
- mov %rcx, 15(%rdi)
-#ifdef USE_AS_STPCPY
- lea 22(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $23, %cl
- sub $23, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_23):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
-#ifdef USE_AS_STPCPY
- lea 23(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $24, %cl
- sub $24, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_24):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %edx
- mov %edx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 24(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $25, %cl
- sub $25, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_25):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %edx
- mov %edx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 25(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $26, %cl
- sub $26, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_26):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %edx
- mov %edx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 26(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $27, %cl
- sub $27, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_27):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %edx
- mov %edx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 27(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $28, %cl
- sub $28, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- .p2align 4
-LABEL(tail_28):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 21(%rsi), %rdx
- mov %rdx, 21(%rdi)
-#ifdef USE_AS_STPCPY
- lea 28(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $29, %cl
- sub $29, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
- .p2align 4
-LABEL(tail_29):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 22(%rsi), %rdx
- mov %rdx, 22(%rdi)
-#ifdef USE_AS_STPCPY
- lea 29(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $30, %cl
- sub $30, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
-
- ret
-
-
- .p2align 4
-LABEL(tail_30):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 23(%rsi), %rdx
- mov %rdx, 23(%rdi)
-#ifdef USE_AS_STPCPY
- lea 30(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $31, %cl
- sub $31, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
-
- .p2align 4
-LABEL(tail_31):
- mov (%rsi), %rcx
- mov %rcx, (%rdi)
- mov 8(%rsi), %rdx
- mov %rdx, 8(%rdi)
- mov 16(%rsi), %rcx
- mov %rcx, 16(%rdi)
- mov 24(%rsi), %rdx
- mov %rdx, 24(%rdi)
-#ifdef USE_AS_STPCPY
- lea 31(%rdi), %rax
-#endif
-#ifdef USE_AS_STRNCPY
- mov $32, %cl
- sub $32, %r8
- jnz LABEL(strncpy_fill_tail)
-#ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-#endif
-#endif
- ret
- cfi_endproc
- .size STRCPY_SSSE3, .-STRCPY_SSSE3
-
- .p2align 4
- .section .rodata.ssse3,"a",@progbits
-LABEL(tail_table):
- .int LABEL(tail_0) - LABEL(tail_table)
- .int LABEL(tail_1) - LABEL(tail_table)
- .int LABEL(tail_2) - LABEL(tail_table)
- .int LABEL(tail_3) - LABEL(tail_table)
- .int LABEL(tail_4) - LABEL(tail_table)
- .int LABEL(tail_5) - LABEL(tail_table)
- .int LABEL(tail_6) - LABEL(tail_table)
- .int LABEL(tail_7) - LABEL(tail_table)
- .int LABEL(tail_8) - LABEL(tail_table)
- .int LABEL(tail_9) - LABEL(tail_table)
- .int LABEL(tail_10) - LABEL(tail_table)
- .int LABEL(tail_11) - LABEL(tail_table)
- .int LABEL(tail_12) - LABEL(tail_table)
- .int LABEL(tail_13) - LABEL(tail_table)
- .int LABEL(tail_14) - LABEL(tail_table)
- .int LABEL(tail_15) - LABEL(tail_table)
- .int LABEL(tail_16) - LABEL(tail_table)
- .int LABEL(tail_17) - LABEL(tail_table)
- .int LABEL(tail_18) - LABEL(tail_table)
- .int LABEL(tail_19) - LABEL(tail_table)
- .int LABEL(tail_20) - LABEL(tail_table)
- .int LABEL(tail_21) - LABEL(tail_table)
- .int LABEL(tail_22) - LABEL(tail_table)
- .int LABEL(tail_23) - LABEL(tail_table)
- .int LABEL(tail_24) - LABEL(tail_table)
- .int LABEL(tail_25) - LABEL(tail_table)
- .int LABEL(tail_26) - LABEL(tail_table)
- .int LABEL(tail_27) - LABEL(tail_table)
- .int LABEL(tail_28) - LABEL(tail_table)
- .int LABEL(tail_29) - LABEL(tail_table)
- .int LABEL(tail_30) - LABEL(tail_table)
- .int LABEL(tail_31) - LABEL(tail_table)
-
- .p2align 4
-LABEL(unaligned_table):
- .int LABEL(ashr_0) - LABEL(unaligned_table)
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
-
# undef ENTRY
# define ENTRY(name) \
.type STRCPY_SSE2, @function; \
diff --git a/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
new file mode 100644
index 0000000..fcc23a7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2_unaligned
+#include "strcpy-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000..bf82ee4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S"
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d5495a116c6271c0ae8f6955b64b7b010b1b341a
commit d5495a116c6271c0ae8f6955b64b7b010b1b341a
Author: Ulrich Drepper <drepper@gmail.com>
Date: Fri Jun 24 14:59:17 2011 -0400
Work around limit in writev in 2.6.38+ kernels
diff --git a/ChangeLog b/ChangeLog
index 097ad20..8bf8eeb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-06-24 Ulrich Drepper <drepper@gmail.com>
+
+ [BZ #12874]
+ * sysdeps/unix/sysv/linux/Makefile (CFLAGS-tst-writev.c): Define.
+ * sysdeps/wordsize-64/tst-writev.c: Work around problem with 2.6.38+
+ kernels which artificially limit size of requests.
+
2011-06-22 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
diff --git a/NEWS b/NEWS
index edb356d..dd28004 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.15
* The following bugs are resolved with this release:
- 12885, 12907
+ 12874, 12885, 12907
* New program pldd to list loaded object of a process
Implemented by Ulrich Drepper.
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 05834e3..ebb3f5d 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -22,6 +22,7 @@ sysdep_routines += sysctl clone llseek umount umount2 readahead \
eventfd eventfd_read eventfd_write prlimit
CFLAGS-gethostid.c = -fexceptions
+CFLAGS-tst-writev.c += -DARTIFICIAL_LIMIT=0x7ffff000
sysdep_headers += sys/mount.h sys/acct.h sys/sysctl.h \
sys/klog.h sys/kdaemon.h \
diff --git a/sysdeps/wordsize-64/tst-writev.c b/sysdeps/wordsize-64/tst-writev.c
index 6e47886..015ad46 100644
--- a/sysdeps/wordsize-64/tst-writev.c
+++ b/sysdeps/wordsize-64/tst-writev.c
@@ -96,8 +96,14 @@ do_test (void)
if (ret != (ssize_t) EXPECTED)
{
- printf ("writev() return value: %zd != EXPECTED: %zd\n", ret, EXPECTED);
- return 1;
+#ifdef ARTIFICIAL_LIMIT
+ if (ret != (ssize_t) ARTIFICIAL_LIMIT)
+#endif
+ {
+ printf ("writev() return value: %zd != EXPECTED: %zd\n",
+ ret, EXPECTED);
+ return 1;
+ }
}
return 0;
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0b1cbaaef5ccc21baf2c35d4698fb28e82eab385
commit 0b1cbaaef5ccc21baf2c35d4698fb28e82eab385
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Fri Jun 24 14:15:32 2011 -0400
Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
diff --git a/ChangeLog b/ChangeLog
index b4d6496..097ad20 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2011-06-22 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+ strncpy-c strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+ strcpy-sse2 strncpy-sse2 stpcpy-sse2 stpncpy-sse2.
+ * sysdeps/i386/i686/multiarch/stpcpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/stpcpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy.S : New file.
+ * sysdeps/i386/i686/multiarch/strcpy-sse2.S : New file.
+ * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/strcpy.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-c.c: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy.S: New file.
+ * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+ Enable unaligned load optimization for Intel Core i3, i5 and i7
+ processors.
+ * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Unaligned_Load):
+ Define.
+ (index_Fast_Unaligned_Load): Define.
+ (HAS_FAST_UNALIGNED_LOAD): Define.
+
2011-06-23 Marek Polacek <mpolacek@redhat.com>
* nss/nss_db/db-open.c: Include <unistd.h> for read declaration.
diff --git a/NEWS b/NEWS
index 5a7ffc2..edb356d 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes. 2011-6-22
+GNU C Library NEWS -- history of user-visible changes. 2011-6-24
Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
See the end for copying conditions.
@@ -17,6 +17,9 @@ Version 2.15
* Add nss_db support back to glibc. No more dependency on Berkeley db
and support for initgroups lookups.
Implemented by Ulrich Drepper.
+
+* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
+ Contributed by HJ Lu.
Version 2.14
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 32286d8..4bae699 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -10,7 +10,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
- strlen-sse2 strlen-sse2-bsf
+ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+ strncpy-sse2 stpcpy-sse2 stpncpy-sse2
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000..46ca1b3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_sse2
+#include "strcpy-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000..d971c2d
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/stpcpy.S b/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000..b63d308
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000..37a703c
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_sse2
+#include "strcpy-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000..14ed16f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_ssse3
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/stpncpy.S b/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000..ff89a89
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,6 @@
+#define STRCPY __stpncpy
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000..fad1ae2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2251 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 16 /* return address plus the three registers pushed by ENTRANCE */
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; /* then re-record the saves, in ENTRANCE order, for the code that follows ret */ \
+ CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+ jump table with relative offsets.
+ INDEX is a register containing the index into the jump table.
+ SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ call __i686.get_pc_thunk.cx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table entry and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute addresses. INDEX is a register containing the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edi /* edi = first stack argument; used as the store address below */
+ mov STR2(%esp), %esi /* esi = second stack argument; used as the load address below */
+ movl LEN(%esp), %ebx /* ebx = third stack argument, the byte count */
+ test %ebx, %ebx
+ jz L(ExitZero) /* count == 0 */
+
+ mov %esi, %ecx
+# ifndef USE_AS_STPCPY
+ mov %edi, %eax /* save result */
+# endif
+ and $15, %ecx /* ecx = source address modulo 16 */
+ jz L(SourceStringAlignmentZero) /* source is already 16-byte aligned */
+
+ and $-16, %esi /* round the source down to a 16-byte boundary */
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%esi), %xmm1 /* 0xff in every lane where the aligned block holds a zero byte */
+ add %ecx, %ebx /* count is now measured from the aligned base */
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx /* discard mask bits for bytes before the real start */
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail) /* zero byte found in the first block */
+
+ pcmpeqb 16(%esi), %xmm0 /* same zero-byte scan on the next aligned block */
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes) /* zero byte found in the second block */
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%edi)
+
+ sub %ecx, %edi /* bias edi like esi so equal offsets from both address corresponding bytes */
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ mov $16, %ecx
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movdqu %xmm3, (%edi, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %edi
+ lea 128(%ebx, %edx), %ebx
+
+L(Unaligned64Loop):
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+L(Unaligned64Loop_start):
+ add $64, %edi
+ add $64, %esi
+ movdqu %xmm4, -64(%edi)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%edi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+ movdqu %xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax
+# endif
+ movdqu %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx
+ lea 49(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+ pxor %xmm0, %xmm0
+ movdqa (%esi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb 16(%esi), %xmm0
+ movdqu %xmm1, (%edi)
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ jmp L(Unalign16Both)
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %edi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ sub %ecx, %ebx
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+# endif
+ movdqu %xmm4, (%edi)
+ add $63, %ebx
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax
+# endif
+ movdqu %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx
+ lea 17(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax
+# endif
+ movdqu %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx
+ lea 33(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %edi
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(Exit0):
+# ifdef USE_AS_STPCPY
+ mov %edi, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ movb %dh, (%edi)
+# ifdef USE_AS_STPCPY
+ lea (%edi), %eax
+# endif
+ sub $1, %ebx
+ lea 1(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ sub $2, %ebx
+ lea 2(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%edi)
+ movb %dh, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ sub $3, %ebx
+ lea 3(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ sub $4, %ebx
+ lea 4(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%esi), %ecx
+ movb %dh, 4(%edi)
+ movl %ecx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ sub $5, %ebx
+ lea 5(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ sub $6, %ebx
+ lea 6(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ sub $7, %ebx
+ lea 7(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ sub $8, %ebx
+ lea 8(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movlpd (%esi), %xmm0
+ movb %dh, 8(%edi)
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ sub $9, %ebx
+ lea 9(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ sub $10, %ebx
+ lea 10(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ sub $11, %ebx
+ lea 11(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ sub $12, %ebx
+ lea 12(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ sub $13, %ebx
+ lea 13(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ sub $14, %ebx
+ lea 14(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ sub $15, %ebx
+ lea 15(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ sub $16, %ebx
+ lea 16(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+ movb %dh, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ sub $17, %ebx
+ lea 17(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ sub $18, %ebx
+ lea 18(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ sub $19, %ebx
+ lea 19(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ sub $20, %ebx
+ lea 20(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dh, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ sub $21, %ebx
+ lea 21(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ sub $22, %ebx
+ lea 22(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ sub $23, %ebx
+ lea 23(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ sub $24, %ebx
+ lea 24(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %dh, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ sub $25, %ebx
+ lea 25(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ sub $26, %ebx
+ lea 26(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ sub $27, %ebx
+ lea 27(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ sub $28, %ebx
+ lea 28(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ sub $29, %ebx
+ lea 29(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ sub $30, %ebx
+ lea 30(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+
+ .p2align 4
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ sub $31, %ebx
+ lea 31(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ sub $32, %ebx
+ lea 32(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(StrncpyExit1):
+ movb (%esi), %dl
+ movb %dl, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit3):
+ movw (%esi), %cx
+ movb 2(%esi), %dl
+ movw %cx, (%edi)
+ movb %dl, 2(%edi)
+# ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ movl (%esi), %ecx
+ movb 4(%esi), %dl
+ movl %ecx, (%edi)
+ movb %dl, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+# ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+# ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ movlpd (%esi), %xmm0
+ movb 8(%esi), %dl
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+# ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+# ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+# ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+# ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%esi), %xmm0
+ movb 16(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movb %cl, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movb 20(%esi), %dl
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dl, 20(%edi)
+# ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movb 24(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %cl, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+# ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+# ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+# ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+# ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+# ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movb 32(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+ movb %cl, 32(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movl %edx, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%edi)
+ movb %dl, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%edi)
+ movw %dx, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movlpd %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 5(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 6(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%edi, %ecx) /* store the 16-byte block held in xmm2, then fall through */
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %edx, %edx /* edx = index of the lowest set bit of the mask in edx */
+ add $15, %ebx
+ add %ecx, %edi /* edi = address the last block was stored to */
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax /* return value = address of the byte at that index */
+# endif
+ sub %edx, %ebx /* together with the add above: ebx += 15 - index */
+ lea 1(%edi, %edx), %edi /* edi = one past that byte; falls through into the zero fill */
+
+ .p2align 4
+L(StrncpyFillTailWithZero): /* zero-fill starting at edi; ebx is the byte count */
+ pxor %xmm0, %xmm0 /* xmm0 = 16 zero bytes for the SSE stores */
+ xor %edx, %edx /* edx = 0 for the scalar stores in the Fill stubs */
+ sub $16, %ebx
+ jbe L(StrncpyFillExit) /* at most 16 bytes: one table dispatch finishes it */
+
+ movdqu %xmm0, (%edi) /* zero 16 bytes at the current, possibly unaligned, position */
+ add $16, %edi
+
+ mov %edi, %esi
+ and $0xf, %esi /* esi = edi modulo 16 */
+ sub %esi, %edi /* round edi down to a 16-byte boundary ... */
+ add %esi, %ebx /* ... and add the stepped-back bytes to the count */
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* zero 64 aligned bytes per iteration */
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ movdqa %xmm0, 32(%edi)
+ movdqa %xmm0, 48(%edi)
+ add $64, %edi
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64): /* ebx = remaining - 64 here, i.e. negative */
+ add $32, %ebx
+ jl L(StrncpyFillLess32) /* fewer than 32 bytes remain */
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ add $32, %edi
+ sub $16, %ebx
+ jl L(StrncpyFillExit) /* fewer than 16 bytes remain after the 32 just written */
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) /* ebx = bytes still to write */
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit) /* fewer than 16 bytes remain */
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) /* ebx = bytes still to write */
+
+L(StrncpyFillExit):
+ add $16, %ebx /* undo the last subtraction of 16 so ebx is the remaining count */
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%edi)
+# ifdef USE_AS_STPCPY
+ lea 64(%edi), %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%edi)
+ lea 16(%edi, %ecx), %edi
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(ExitZero):
+ movl %edi, %eax
+ RETURN
+
+END (STRCPY)
+
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+
+L(ExitStrncpyTable): /* 34 entries: index 0 -> Exit0, index k -> StrncpyExit<k> for k = 1..33 */
+ .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) /* NOTE(review): JMPTBL is defined outside this excerpt; presumably it yields the target's offset relative to the table label given as its second argument - confirm */
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) /* last entry (index 33) */
+
+ .p2align 4 /* start the table on a 16-byte boundary */
+L(FillTable): /* 17 entries: index k -> Fill<k> for k = 0..16 */
+ .int JMPTBL(L(Fill0), L(FillTable)) /* NOTE(review): JMPTBL is defined outside this excerpt; presumably it yields the target's offset relative to the table label given as its second argument - confirm */
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable)) /* last entry (index 16) */
+# else
+# define PARMS 4 /* NOTE(review): presumably the %esp offset of the first argument; STR1/STR2 for this file are defined outside this excerpt - confirm */
+# define ENTRANCE /* expands to nothing: no register is saved at function entry */
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi) /* return for paths on which only %edi is still saved on the stack; the CFI_PUSH after ret presumably re-arms the unwind annotations for the code that follows */
+# define RETURN1 ret /* return for the ExitTail paths, which are taken before anything has been pushed */
+
+ .text
+ENTRY (STRCPY) /* String copy using SSE2; when USE_AS_STPCPY is defined the exit paths return the address of the copied NUL instead of the destination */
+ ENTRANCE
+ mov STR1(%esp), %edx /* %edx = destination (only ever stored through below) */
+ mov STR2(%esp), %ecx /* %ecx = source (only ever loaded from below) */
+
+ cmpb $0, (%ecx) /* probe source bytes 0..15 one at a time: a NUL at byte k-1 branches to ExitTail<k>, which copies k bytes */
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi) /* from here on the first 16 source bytes are known to be NUL-free */
+ PUSH (%ebx)
+
+ mov %edx, %edi /* %edi keeps the original destination; the non-stpcpy exits return it */
+ lea 16(%ecx), %ebx
+ and $-16, %ebx /* %ebx = first 16-byte boundary strictly above %ecx */
+ pxor %xmm0, %xmm0 /* %xmm0 = all zero, the comparand used to detect NUL bytes */
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx) /* copy the first 16 bytes with an unaligned load and store */
+ pcmpeqb (%ebx), %xmm0 /* 0xff in every byte lane where the aligned block at %ebx holds a NUL */
+ pmovmskb %xmm0, %eax /* %eax bit i set <=> byte i of that block is NUL */
+ sub %ecx, %ebx /* %ebx = distance from %ecx to that aligned block (1..16) */
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes) /* NUL found in the block at %ecx + %ebx */
+
+ mov %ecx, %eax
+ lea 16(%ecx), %ecx
+ and $-16, %ecx /* advance the source to the aligned block just tested */
+ sub %ecx, %eax /* %eax = old %ecx - new %ecx (negative) */
+ sub %eax, %edx /* advance the destination by the same distance */
+ xor %ebx, %ebx /* %ebx = running offset of the block currently under test */
+
+ .p2align 4
+ movdqa (%ecx), %xmm1 /* this block was verified NUL-free by the compare above */
+ movaps 16(%ecx), %xmm2 /* next aligned block */
+ movdqu %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0 /* the previous compare found no NUL, so %xmm0 is still all zero and can serve as the comparand again */
+ pmovmskb %xmm0, %eax
+ add $16, %ebx /* the offset now names the block just tested */
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3 /* the same step repeats below: load the next block, store the previous clean one, test the new one */
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm4
+ movdqu %xmm3, (%edx, %ebx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm1
+ movdqu %xmm4, (%edx, %ebx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm2
+ movdqu %xmm1, (%edx, %ebx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %ebx), %xmm3
+ movdqu %xmm2, (%edx, %ebx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ add $16, %ebx
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm3, (%edx, %ebx) /* the last block of the unrolled run was clean: store it (%ebx is 96 here) */
+ mov %ecx, %eax
+ lea 16(%ecx, %ebx), %ecx
+ and $-0x40, %ecx /* round the next source position down to a 64-byte boundary; any bytes between that boundary and the current position are copied again by the loop */
+ sub %ecx, %eax
+ sub %eax, %edx /* move the destination by the same delta as the source */
+
+L(Aligned64Loop): /* first pass over a 64-byte-aligned source chunk; later passes begin at Aligned64Loop_start */
+ movaps (%ecx), %xmm2
+ movaps %xmm2, %xmm4 /* keep block 0 intact: %xmm2 is overwritten by pminub below */
+ movaps 16(%ecx), %xmm5
+ movaps 32(%ecx), %xmm3
+ movaps %xmm3, %xmm6 /* keep block 2 intact: %xmm3 is overwritten by pminub below */
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ add $64, %ecx
+ pminub %xmm7, %xmm3
+ add $64, %edx /* both pointers now sit 64 bytes past the chunk held in %xmm4..%xmm7 */
+ pminub %xmm2, %xmm3 /* byte-wise minimum of all four blocks: a zero byte here means a NUL somewhere in the 64 bytes */
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+L(Aligned64Loop_start): /* store the clean chunk held in %xmm4..%xmm7 while loading and testing the next one */
+ movdqu %xmm4, -64(%edx)
+ movaps (%ecx), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edx)
+ movaps 16(%ecx), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%ecx), %xmm3
+ movdqu %xmm6, -32(%edx)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edx)
+ movaps 48(%ecx), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm3, %xmm0 /* the result lands in %xmm0 here, so %xmm0 stays all zero for as long as the loop continues */
+ pmovmskb %xmm0, %eax
+ add $64, %edx
+ add $64, %ecx
+ test %eax, %eax
+ jz L(Aligned64Loop_start)
+L(Aligned64Leave): /* a NUL lies in the chunk held in %xmm4..%xmm7: find which 16-byte block holds it */
+ sub $0xa0, %ebx /* %ebx is 96 on arrival (six "add $16" steps before the loop), so this gives -64: the chunk start relative to the already advanced pointers */
+ pxor %xmm0, %xmm0 /* re-zero: %xmm0 may hold the loop's compare result */
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm4, -64(%edx) /* block 0 was clean: store it */
+ test %eax, %eax
+ lea 16(%ebx), %ebx /* lea leaves the flags set by test untouched */
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu %xmm5, -48(%edx) /* block 1 was clean: store it */
+ test %eax, %eax
+ lea 16(%ebx), %ebx
+ jnz L(CopyFrom1To16Bytes)
+
+ movdqu %xmm6, -32(%edx) /* block 2 was clean: store it */
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax /* no test needed: the NUL has to be in block 3 */
+ lea 16(%ebx), %ebx /* execution falls through into CopyFrom1To16Bytes */
+
+/*-----------------End of main part---------------------------*/
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* in: %eax = NUL bitmask of a 16-byte block, %ebx = that block's offset from %ecx and %edx */
+ add %ebx, %edx
+ add %ebx, %ecx /* both pointers now address the block that contains the NUL */
+
+ POP (%ebx)
+ test %al, %al
+ jz L(ExitHigh) /* no NUL in bytes 0..7: ExitHigh inspects %ah for bytes 8..15 */
+ test $0x01, %al /* bits are tested lowest first; Exit<k> copies k bytes, the last of which is the NUL */
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ /* Exit 8: only bit 7 of %al can be set here, so copy 8 bytes as two dwords */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the original destination saved in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitHigh): /* reached when %al is zero: the first NUL is in bytes 8..15, described by %ah */
+ test $0x01, %ah /* bits are tested lowest first; Exit<k> copies k bytes, the last of which is the NUL */
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ /* Exit 16: none of bits 0..6 of %ah is set, so copy all 16 bytes as two 8-byte moves */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the original destination saved in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1): /* NUL is byte 0 of the block at %ecx: copy 1 byte to %edx */
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit2): /* NUL is byte 1 of the block at %ecx: copy 2 bytes to %edx as one word */
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit3): /* NUL is byte 2 of the block at %ecx: copy 3 bytes to %edx as a word plus a byte */
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit4): /* NUL is byte 3 of the block at %ecx: copy 4 bytes to %edx as one dword */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit5): /* NUL is byte 4 of the block at %ecx: copy 5 bytes to %edx as a dword plus a byte */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit6): /* NUL is byte 5 of the block at %ecx: copy 6 bytes to %edx as a dword plus a word */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit7): /* NUL is byte 6 of the block at %ecx: copy 7 bytes to %edx as two overlapping dwords */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax /* bytes 3..6; byte 3 is written twice with the same value */
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit9): /* NUL is byte 8 of the block at %ecx: copy 9 bytes to %edx as two dwords plus a byte */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit10): /* NUL is byte 9 of the block at %ecx: copy 10 bytes to %edx as two dwords plus a word */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit11): /* NUL is byte 10 of the block at %ecx: copy 11 bytes to %edx as three dwords, the last overlapping */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax /* bytes 7..10; byte 7 is written twice with the same value */
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit12): /* NUL is byte 11 of the block at %ecx: copy 12 bytes to %edx as three dwords */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit13): /* NUL is byte 12 of the block at %ecx: copy 13 bytes to %edx as two overlapping 8-byte moves */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0 /* bytes 5..12; bytes 5..7 are written twice with the same values */
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit14): /* NUL is byte 13 of the block at %ecx: copy 14 bytes to %edx as two overlapping 8-byte moves */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0 /* bytes 6..13; bytes 6..7 are written twice with the same values */
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit15): /* NUL is byte 14 of the block at %ecx: copy 15 bytes to %edx as two overlapping 8-byte moves */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0 /* bytes 7..14; byte 7 is written twice with the same value */
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edi, %eax /* strcpy result: the value kept in %edi */
+# endif
+ RETURN
+
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail1): /* source byte 0 is NUL: copy 1 byte; taken from the entry probes, before anything is pushed */
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edx, %eax /* %edx is both the destination and the NUL's address, so no USE_AS_STPCPY split is needed */
+ RETURN1
+
+ .p2align 4
+L(ExitTail2): /* source byte 1 is NUL: copy 2 bytes as one word; taken from the entry probes, before anything is pushed */
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail3): /* source byte 2 is NUL: copy 3 bytes as a word plus a byte; taken from the entry probes, before anything is pushed */
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail4): /* source byte 3 is NUL: copy 4 bytes as one dword; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail5): /* source byte 4 is NUL: copy 5 bytes as a dword plus a byte; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail6): /* source byte 5 is NUL: copy 6 bytes as a dword plus a word; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail7): /* source byte 6 is NUL: copy 7 bytes as two overlapping dwords; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax /* bytes 3..6; byte 3 is written twice with the same value */
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail8): /* source byte 7 is NUL: copy 8 bytes as two dwords; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail9): /* source byte 8 is NUL: copy 9 bytes as two dwords plus a byte; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail10): /* source byte 9 is NUL: copy 10 bytes as two dwords plus a word; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail11): /* source byte 10 is NUL: copy 11 bytes as three dwords, the last overlapping; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 7(%ecx), %eax /* bytes 7..10; byte 7 is written twice with the same value */
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail12): /* source byte 11 is NUL: copy 12 bytes as three dwords; taken from the entry probes, before anything is pushed */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 4(%ecx), %eax
+ movl %eax, 4(%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail13): /* source byte 12 is NUL: copy 13 bytes as two overlapping 8-byte moves; taken from the entry probes, before anything is pushed */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0 /* bytes 5..12; bytes 5..7 are written twice with the same values */
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail14): /* source byte 13 is NUL: copy 14 bytes as two overlapping 8-byte moves; taken from the entry probes, before anything is pushed */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0 /* bytes 6..13; bytes 6..7 are written twice with the same values */
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail15): /* source byte 14 is NUL: copy 15 bytes as two overlapping 8-byte moves; taken from the entry probes, before anything is pushed */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0 /* bytes 7..14; byte 7 is written twice with the same value */
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitTail16): /* source byte 15 is NUL: copy 16 bytes as two 8-byte moves; taken from the entry probes, before anything is pushed */
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax /* stpcpy result: address of the copied NUL */
+# else
+ movl %edx, %eax /* strcpy result: the destination, still in %edx */
+# endif
+ RETURN1
+
+END (STRCPY)
+# endif
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
new file mode 100644
index 0000000..577d117
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -0,0 +1,4090 @@
+/* strcpy with SSSE3
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_ssse3
+# endif
+
+# ifdef USE_AS_STRNCPY /* length-limited build: %ebx is saved at entry and receives LEN */
+# define PARMS 8 /* ENTRANCE has pushed %ebx by the time the arguments are read, so STR1 sits at 8(%esp) instead of 4(%esp) */
+# define ENTRANCE PUSH(%ebx)
+# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); /* undo ENTRANCE and return; the CFI_PUSH after ret re-describes the saved register for the code that follows */
+# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) /* like RETURN, for paths on which %edi is saved on top of %ebx */
+# else /* build without a length argument: nothing is saved at entry */
+# define PARMS 4 /* STR1 sits at 4(%esp) */
+# define ENTRANCE
+# define RETURN ret
+# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) /* for paths on which %edi is saved */
+# endif
+
+# define STR1 PARMS /* %esp offset loaded into %edx by the entry code */
+# define STR2 STR1+4 /* %esp offset loaded into %ecx by the entry code */
+# define LEN STR2+4 /* %esp offset loaded into %ebx by the entry code under USE_AS_STRNCPY */
+
+/* In this code the following instructions are used for copying:
+ movb - 1 byte
+ movw - 2 bytes
+ movl - 4 bytes
+ movlpd - 8 bytes
+ movaps - 16 bytes - requires 16-byte alignment
+ of the source and destination addresses.
+ 16-byte alignment: an address is a 32-bit value;
+ its low four bits must be 0.
+*/
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitTail0)
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ je L(ExitTail16)
+# endif
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+ PUSH (%esi)
+# ifdef USE_AS_STRNCPY
+ mov %ecx, %esi
+ and $0xf, %esi
+
+/* add 16 bytes ecx_shift to ebx */
+
+ add %esi, %ebx
+# endif
+ lea 16(%ecx), %esi
+/* Now:
+ esi = alignment_16(ecx) + ecx_shift + 16;
+ ecx_shift = ecx - alignment_16(ecx)
+*/
+ and $-16, %esi
+/* Now:
+ esi = alignment_16(ecx) + 16
+*/
+ pxor %xmm0, %xmm0
+ movlpd (%ecx), %xmm1
+ movlpd %xmm1, (%edx)
+/*
+ look if there is zero symbol in next 16 bytes of string
+ from esi to esi + 15 and form mask in xmm0
+*/
+ pcmpeqb (%esi), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+
+/* convert byte mask in xmm0 to bit mask */
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+/* esi = 16 - ecx_shift */
+
+/* eax = 0: there isn't end of string from position esi to esi+15 */
+
+# ifdef USE_AS_STRNCPY
+ sub $32, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+/* Now:
+ edx = edx + 16 = alignment_16(edx) + edx_shift + 16
+*/
+ and $-16, %edx
+
+/* Now: edx = alignment_16(edx) + 16 */
+
+ sub %edx, %eax
+
+/* Now: eax = edx_shift - 16 */
+
+# ifdef USE_AS_STRNCPY
+ add %eax, %esi
+ lea -1(%esi), %esi
+ and $1<<31, %esi
+ test %esi, %esi
+ jnz L(ContinueCopy)
+ lea 16(%ebx), %ebx
+
+L(ContinueCopy):
+# endif
+ sub %eax, %ecx
+/* Now:
+ case ecx_shift >= edx_shift:
+ ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16
+ case ecx_shift < edx_shift:
+ ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift)
+*/
+ mov %ecx, %eax
+ and $0xf, %eax
+/* Now:
+ case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift
+ case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift)
+ eax can be 0, 1, ..., 15
+*/
+ mov $0, %esi
+
+/* case: ecx_shift == edx_shift */
+
+ jz L(Align16Both)
+
+ cmp $8, %eax
+ jae L(ShlHigh8)
+ cmp $1, %eax
+ je L(Shl1)
+ cmp $2, %eax
+ je L(Shl2)
+ cmp $3, %eax
+ je L(Shl3)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $5, %eax
+ je L(Shl5)
+ cmp $6, %eax
+ je L(Shl6)
+ jmp L(Shl7)
+
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %eax
+ je L(Shl9)
+ cmp $10, %eax
+ je L(Shl10)
+ cmp $11, %eax
+ je L(Shl11)
+ cmp $12, %eax
+ je L(Shl12)
+ cmp $13, %eax
+ je L(Shl13)
+ cmp $14, %eax
+ je L(Shl14)
+ jmp L(Shl15)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ lea 48+64(%ebx, %eax), %ebx
+# endif
+ mov $-0x40, %esi
+
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqb %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeaveCase2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+# ifdef USE_AS_STRNCPY
+ lea 48(%ebx), %ebx
+# endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ test %eax, %eax
+ lea 16(%esi), %esi
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+# ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+# endif
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl1):
+ movaps -1(%ecx), %xmm1
+ movaps 15(%ecx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 31(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -15(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+ movaps 15(%ecx), %xmm2
+ movaps 31(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $1, %xmm3, %xmm4
+ jnz L(Shl1Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave1)
+# endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $15, %xmm6
+ mov $15, %esi
+ palignr $1, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2):
+ movaps -2(%ecx), %xmm1
+ movaps 14(%ecx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 30(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -14(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+ movaps 14(%ecx), %xmm2
+ movaps 30(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $2, %xmm3, %xmm4
+ jnz L(Shl2Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave2)
+# endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %esi
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3):
+ movaps -3(%ecx), %xmm1
+ movaps 13(%ecx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 29(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -13(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+ movaps 13(%ecx), %xmm2
+ movaps 29(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $3, %xmm3, %xmm4
+ jnz L(Shl3Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave3)
+# endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %esi
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $4, %xmm3, %xmm4
+ jnz L(Shl4Start)
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave4)
+# endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movaps (%edx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %esi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5):
+ movaps -5(%ecx), %xmm1
+ movaps 11(%ecx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 27(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -11(%ecx), %ecx
+ sub %eax, %edx
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+# endif
+ movaps -5(%ecx), %xmm1
+
+L(Shl5LoopStart): /* Main loop of the shift-by-5 path: scans and copies 64 source bytes per iteration.  */
+ movaps 11(%ecx), %xmm2
+ movaps 27(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $5, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $5, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl5Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave5)
+# endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit): /* Reached with a non-zero compare mask in %eax: write the 11 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $11, %xmm6 /* its bytes 11..15 move down to positions 0..4 */
+ mov $11, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $5, %xmm1, %xmm6 /* bytes 5..15 of %xmm1, followed by the 5 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6): /* Copy path built on `palignr $6`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -6(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-6 */
+ movaps 10(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+10 */
+L(Shl6Start): /* Also entered from L(Shl6LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit6Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit) /* a byte matched in this chunk */
+
+ palignr $6, %xmm1, %xmm2 /* %xmm2 = bytes 6..15 of %xmm1 followed by bytes 0..5 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 26(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 26(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -10(%ecx), %ecx /* bias so that 10(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -6(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl6LoopStart): /* Main loop of the shift-by-6 path: scans and copies 64 source bytes per iteration.  */
+ movaps 10(%ecx), %xmm2
+ movaps 26(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $6, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $6, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl6Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave6)
+# endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit): /* Reached with a non-zero compare mask in %eax: write the 10 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $10, %xmm6 /* its bytes 10..15 move down to positions 0..5 */
+ mov $10, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $6, %xmm1, %xmm6 /* bytes 6..15 of %xmm1, followed by the 6 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7): /* Copy path built on `palignr $7`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -7(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-7 */
+ movaps 9(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+9 */
+L(Shl7Start): /* Also entered from L(Shl7LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit7Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit) /* a byte matched in this chunk */
+
+ palignr $7, %xmm1, %xmm2 /* %xmm2 = bytes 7..15 of %xmm1 followed by bytes 0..6 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 25(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 25(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -9(%ecx), %ecx /* bias so that 9(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -7(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl7LoopStart): /* Main loop of the shift-by-7 path: scans and copies 64 source bytes per iteration.  */
+ movaps 9(%ecx), %xmm2
+ movaps 25(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $7, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $7, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl7Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave7)
+# endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit): /* Reached with a non-zero compare mask in %eax: write the 9 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $9, %xmm6 /* its bytes 9..15 move down to positions 0..6 */
+ mov $9, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $7, %xmm1, %xmm6 /* bytes 7..15 of %xmm1, followed by the 7 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8): /* Copy path built on `palignr $8`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -8(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-8 */
+ movaps 8(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+8 */
+L(Shl8Start): /* Also entered from L(Shl8LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit8Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit) /* a byte matched in this chunk */
+
+ palignr $8, %xmm1, %xmm2 /* %xmm2 = bytes 8..15 of %xmm1 followed by bytes 0..7 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 24(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -8(%ecx), %ecx /* bias so that 8(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -8(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl8LoopStart): /* Main loop of the shift-by-8 path: scans and copies 64 source bytes per iteration.  */
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $8, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $8, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl8Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave8)
+# endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit): /* Reached with a non-zero compare mask in %eax: write the 8 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $8, %xmm6 /* its bytes 8..15 move down to positions 0..7 */
+ mov $8, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $8, %xmm1, %xmm6 /* bytes 8..15 of %xmm1, followed by the 8 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9): /* Copy path built on `palignr $9`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -9(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-9 */
+ movaps 7(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+7 */
+L(Shl9Start): /* Also entered from L(Shl9LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit9Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit) /* a byte matched in this chunk */
+
+ palignr $9, %xmm1, %xmm2 /* %xmm2 = bytes 9..15 of %xmm1 followed by bytes 0..8 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 23(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 23(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -7(%ecx), %ecx /* bias so that 7(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -9(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl9LoopStart): /* Main loop of the shift-by-9 path: scans and copies 64 source bytes per iteration.  */
+ movaps 7(%ecx), %xmm2
+ movaps 23(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $9, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $9, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl9Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave9)
+# endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit): /* Reached with a non-zero compare mask in %eax: write the 7 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $7, %xmm6 /* its bytes 7..15 move down to positions 0..8 */
+ mov $7, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $9, %xmm1, %xmm6 /* bytes 9..15 of %xmm1, followed by the 9 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10): /* Copy path built on `palignr $10`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -10(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-10 */
+ movaps 6(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+6 */
+L(Shl10Start): /* Also entered from L(Shl10LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit10Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit) /* a byte matched in this chunk */
+
+ palignr $10, %xmm1, %xmm2 /* %xmm2 = bytes 10..15 of %xmm1 followed by bytes 0..9 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 22(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 22(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -6(%ecx), %ecx /* bias so that 6(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -10(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl10LoopStart): /* Main loop of the shift-by-10 path: scans and copies 64 source bytes per iteration.  */
+ movaps 6(%ecx), %xmm2
+ movaps 22(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $10, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $10, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl10Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave10)
+# endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit): /* Reached with a non-zero compare mask in %eax: write the 6 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $6, %xmm6 /* its bytes 6..15 move down to positions 0..9 */
+ mov $6, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $10, %xmm1, %xmm6 /* bytes 10..15 of %xmm1, followed by the 10 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11): /* Copy path built on `palignr $11`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -11(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-11 */
+ movaps 5(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+5 */
+L(Shl11Start): /* Also entered from L(Shl11LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit11Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit) /* a byte matched in this chunk */
+
+ palignr $11, %xmm1, %xmm2 /* %xmm2 = bytes 11..15 of %xmm1 followed by bytes 0..10 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 21(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 21(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -5(%ecx), %ecx /* bias so that 5(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -11(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl11LoopStart): /* Main loop of the shift-by-11 path: scans and copies 64 source bytes per iteration.  */
+ movaps 5(%ecx), %xmm2
+ movaps 21(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $11, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $11, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl11Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave11)
+# endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit): /* Reached with a non-zero compare mask in %eax: write the 5 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $5, %xmm6 /* its bytes 5..15 move down to positions 0..10 */
+ mov $5, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $11, %xmm1, %xmm6 /* bytes 11..15 of %xmm1, followed by the 11 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12): /* Copy path built on `palignr $12`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -12(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-12 */
+ movaps 4(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+4 */
+L(Shl12Start): /* Also entered from L(Shl12LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit12Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit) /* a byte matched in this chunk */
+
+ palignr $12, %xmm1, %xmm2 /* %xmm2 = bytes 12..15 of %xmm1 followed by bytes 0..11 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 20(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -4(%ecx), %ecx /* bias so that 4(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -12(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl12LoopStart): /* Main loop of the shift-by-12 path: scans and copies 64 source bytes per iteration.  */
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $12, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $12, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl12Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave12)
+# endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit): /* Reached with a non-zero compare mask in %eax: write the 4 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $4, %xmm6 /* its bytes 4..15 move down to positions 0..11 */
+ mov $4, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $12, %xmm1, %xmm6 /* bytes 12..15 of %xmm1, followed by the 12 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13): /* Copy path built on `palignr $13`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -13(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-13 */
+ movaps 3(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+3 */
+L(Shl13Start): /* Also entered from L(Shl13LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit13Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit) /* a byte matched in this chunk */
+
+ palignr $13, %xmm1, %xmm2 /* %xmm2 = bytes 13..15 of %xmm1 followed by bytes 0..12 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 19(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 19(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -3(%ecx), %ecx /* bias so that 3(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -13(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl13LoopStart): /* Main loop of the shift-by-13 path: scans and copies 64 source bytes per iteration.  */
+ movaps 3(%ecx), %xmm2
+ movaps 19(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $13, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $13, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl13Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave13)
+# endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit): /* Reached with a non-zero compare mask in %eax: write the 3 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $3, %xmm6 /* its bytes 3..15 move down to positions 0..12 */
+ mov $3, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $13, %xmm1, %xmm6 /* bytes 13..15 of %xmm1, followed by the 13 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14): /* Copy path built on `palignr $14`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -14(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-14 */
+ movaps 2(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+2 */
+L(Shl14Start): /* Also entered from L(Shl14LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit14Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit) /* a byte matched in this chunk */
+
+ palignr $14, %xmm1, %xmm2 /* %xmm2 = bytes 14..15 of %xmm1 followed by bytes 0..13 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 18(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 18(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -2(%ecx), %ecx /* bias so that 2(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -14(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl14LoopStart): /* Main loop of the shift-by-14 path: scans and copies 64 source bytes per iteration.  */
+ movaps 2(%ecx), %xmm2
+ movaps 18(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $14, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $14, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl14Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave14)
+# endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit): /* Reached with a non-zero compare mask in %eax: write the 2 bytes still held in %xmm1, then finish in the tail code.  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $2, %xmm6 /* its bytes 2..15 move down to positions 0..13 */
+ mov $2, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $14, %xmm1, %xmm6 /* bytes 14..15 of %xmm1, followed by the 14 preserved destination bytes */
+ movaps %xmm6, (%edx)
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15): /* Copy path built on `palignr $15`: merges consecutive 16-byte source loads into 16-byte destination stores.  */
+ movaps -15(%ecx), %xmm1 /* "previous" chunk, loaded at %ecx-15 */
+ movaps 1(%ecx), %xmm2 /* "current" chunk, loaded at %ecx+1 */
+L(Shl15Start): /* Also entered from L(Shl15LoopStart) when its 64-byte scan reports a hit.  */
+ pcmpeqb %xmm2, %xmm0 /* %xmm0 is presumably all-zero on entry, so this tests %xmm2 for NUL bytes */
+ pmovmskb %xmm0, %eax /* %eax = one bit per byte that compared equal */
+ movaps %xmm2, %xmm3 /* keep the chunk intact; palignr below overwrites %xmm2 */
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyExit15Case2OrCase3) /* taken when %ebx was <= 16 */
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit) /* a byte matched in this chunk */
+
+ palignr $15, %xmm1, %xmm2 /* %xmm2 = byte 15 of %xmm1 followed by bytes 0..14 of %xmm2 */
+ movaps %xmm3, %xmm1 /* current chunk becomes the previous one */
+ movaps %xmm2, (%edx) /* store 16 merged bytes */
+ movaps 17(%ecx), %xmm2 /* next chunk */
+
+ pcmpeqb %xmm2, %xmm0 /* step 2 of 4; %xmm0 is all-zero again because the previous mask was 0 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0 /* step 3 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0 /* step 4 of 4 */
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+# endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ lea 17(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx /* round %ecx down to a multiple of 64 */
+ sub %ecx, %eax /* %eax = number of bytes rounded off */
+ lea -1(%ecx), %ecx /* bias so that 1(%ecx) in the loop is the rounded address */
+ sub %eax, %edx /* back the destination up by the same amount */
+# ifdef USE_AS_STRNCPY
+ add %eax, %ebx /* and give those bytes back to the count */
+# endif
+ movaps -15(%ecx), %xmm1 /* reload the previous chunk for the new position */
+
+L(Shl15LoopStart): /* Main loop of the shift-by-15 path: scans and copies 64 source bytes per iteration.  */
+ movaps 1(%ecx), %xmm2
+ movaps 17(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7 /* %xmm7 = byte-wise unsigned minimum of the four loads */
+ pcmpeqb %xmm0, %xmm7 /* compared with %xmm0 (presumably all-zero): flags a NUL in any of the four chunks */
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7 /* keep the last chunk unshifted; it becomes %xmm1 for the next iteration */
+ palignr $15, %xmm4, %xmm5
+ test %eax, %eax
+ palignr $15, %xmm3, %xmm4 /* SSE op, leaves the flags from test intact for the jnz */
+ jnz L(Shl15Start) /* hit somewhere in these 64 bytes: redo them 16 at a time (%xmm1/%xmm2 still unmodified) */
+# ifdef USE_AS_STRNCPY
+ sub $64, %ebx /* %ebx is presumably the remaining length -- confirm against the caller */
+ jbe L(StrncpyLeave15)
+# endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx) /* store the four merged vectors */
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit): /* Reached with a non-zero compare mask in %eax: write the 1 byte still held in %xmm1; no jmp, execution falls through to L(CopyFrom1To16Bytes).  */
+ movaps (%edx), %xmm6 /* read back the current 16-byte destination slot */
+ psrldq $1, %xmm6 /* its bytes 1..15 move down to positions 0..14 */
+ mov $1, %esi /* offset that L(CopyFrom1To16Bytes) adds to %edx and %ecx */
+ palignr $15, %xmm1, %xmm6 /* byte 15 of %xmm1, followed by the 15 preserved destination bytes */
+ movaps %xmm6, (%edx)
+
+ .p2align 4
+L(CopyFrom1To16Bytes): /* Tail dispatcher: the lowest set bit k of the compare mask in %eax selects L(Exit<k+1>), which copies k+1 bytes.  */
+# ifdef USE_AS_STRNCPY
+ add $16, %ebx /* presumably undoes the `sub $16, %ebx` done before the compare -- confirm for every caller */
+# endif
+ add %esi, %edx /* advance destination by the offset passed in %esi */
+ add %esi, %ecx /* and the source by the same offset */
+
+ POP (%esi) /* POP is a macro defined elsewhere; restores %esi */
+ test %al, %al
+ jz L(ExitHigh) /* no bit set in mask bits 0..7 */
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7) /* otherwise only bit 7 is left: fall through to L(Exit8) */
+
+ .p2align 4
+L(Exit8): /* Copy 8 bytes from (%ecx) to (%edx), set the return value and return; also reached by fall-through from L(CopyFrom1To16Bytes).  */
+ movlpd (%ecx), %xmm0 /* one 8-byte load/store pair through %xmm0 */
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx /* account for the 8 bytes just copied */
+ lea 8(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh): /* Mask bits 0..7 were clear: dispatch on bits 8..15 (in %ah) to L(Exit9)..L(Exit16).  */
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15) /* none of bits 8..14 set: fall through to L(Exit16) */
+
+ .p2align 4
+L(Exit16): /* Copy 16 bytes from (%ecx) to (%edx), set the return value and return; also reached by fall-through from L(ExitHigh).  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0 /* bytes 8..15 */
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* account for the 16 bytes just copied */
+ lea 16(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* strncpy tail with a non-zero compare mask: for each k the count in %ebx is checked before mask bit k-1, and the first that applies selects L(Exit<k>).  */
+ add $16, %ebx /* presumably undoes the earlier `sub $16, %ebx` -- confirm for every caller */
+ add %esi, %ecx
+ lea (%esi, %edx), %esi /* park the advanced destination in %esi while %edx is scratch */
+ lea -9(%ebx), %edx /* %ebx - 9: negative when %ebx < 9 */
+ and $1<<7, %dh /* keep only bit 15 of that difference */
+ or %al, %dh /* non-zero if that bit is set or any of mask bits 0..7 is set */
+ test %dh, %dh
+ lea (%esi), %edx /* put the destination back in %edx; lea does not change the flags */
+ POP (%esi) /* macro defined elsewhere; the jz below still consumes the flags from test */
+ jz L(ExitHighCase2) /* neither condition held: the answer lies in 9..16 */
+
+ cmp $1, %ebx /* count reached first ... */
+ je L(Exit1)
+ test $0x01, %al /* ... or the mask bit for this position */
+ jnz L(Exit1)
+ cmp $2, %ebx
+ je L(Exit2)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $3, %ebx
+ je L(Exit3)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $4, %ebx
+ je L(Exit4)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $5, %ebx
+ je L(Exit5)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $6, %ebx
+ je L(Exit6)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $7, %ebx
+ je L(Exit7)
+ test $0x40, %al
+ jnz L(Exit7)
+ jmp L(Exit8) /* nothing matched for 1..7 */
+
+ .p2align 4
+L(ExitHighCase2): /* Continuation of Case2 for lengths 9..16: same count-then-mask-bit test per position, using %ah for mask bits 8..15.  */
+ cmp $9, %ebx /* count reached first ... */
+ je L(Exit9)
+ test $0x01, %ah /* ... or the mask bit for this position */
+ jnz L(Exit9)
+ cmp $10, %ebx
+ je L(Exit10)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $11, %ebx
+ je L(Exit11)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $12, %ebx
+ je L(Exit12)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $13, %ebx
+ je L(Exit13)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $14, %ebx
+ je L(Exit14)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $15, %ebx
+ je L(Exit15)
+ test $0x40, %ah
+ jnz L(Exit15)
+ jmp L(Exit16) /* nothing matched for 9..15 */
+
+ CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3): /* Selector: non-zero compare mask goes to Case2, a zero mask falls through to Case3.  */
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3): /* strncpy tail that never looks at the mask: picks L(ExitN) from the count in %ebx alone via a small comparison tree.  */
+ add $16, %ebx /* presumably undoes the earlier `sub $16, %ebx` -- confirm for every caller */
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi) /* macro defined elsewhere; restores %esi */
+ cmp $16, %ebx
+ je L(Exit16)
+ cmp $8, %ebx
+ je L(Exit8)
+ jg L(More8Case3) /* signed compare: %ebx > 8 */
+ cmp $4, %ebx
+ je L(Exit4)
+ jg L(More4Case3)
+ cmp $2, %ebx /* here %ebx < 4 */
+ jl L(Exit1)
+ je L(Exit2)
+ jg L(Exit3)
+L(More8Case3): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Exit12)
+ jl L(Less12Case3)
+ cmp $14, %ebx /* here %ebx > 12 */
+ jl L(Exit13)
+ je L(Exit14)
+ jg L(Exit15)
+L(More4Case3): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Exit5)
+ je L(Exit6)
+ jg L(Exit7)
+L(Less12Case3): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Exit9)
+ je L(Exit10)
+ jg L(Exit11)
+# endif
+
+ .p2align 4
+L(Exit1): /* Copy 1 byte from (%ecx) to (%edx), set the return value and return.  */
+ movb (%ecx), %al /* %eax is scratch here; it is overwritten with the return value below */
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx /* account for the byte just copied */
+ lea 1(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit2): /* Copy 2 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movw (%ecx), %ax /* %eax is scratch here; it is overwritten with the return value below */
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx /* account for the 2 bytes just copied */
+ lea 2(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit3): /* Copy 3 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movw (%ecx), %ax /* bytes 0..1 */
+ movw %ax, (%edx)
+ movb 2(%ecx), %al /* byte 2 */
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx /* account for the 3 bytes just copied */
+ lea 3(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit4): /* Copy 4 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movl (%ecx), %eax /* %eax is scratch here; it is overwritten with the return value below */
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx /* account for the 4 bytes just copied */
+ lea 4(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit5): /* Copy 5 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movl (%ecx), %eax /* bytes 0..3 */
+ movl %eax, (%edx)
+ movb 4(%ecx), %al /* byte 4 */
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx /* account for the 5 bytes just copied */
+ lea 5(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit6): /* Copy 6 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movl (%ecx), %eax /* bytes 0..3 */
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax /* bytes 4..5 */
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx /* account for the 6 bytes just copied */
+ lea 6(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit7): /* Copy 7 bytes from (%ecx) to (%edx) with two overlapping 4-byte moves, set the return value and return.  */
+ movl (%ecx), %eax /* bytes 0..3 */
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax /* bytes 3..6; byte 3 is simply written twice */
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx /* account for the 7 bytes just copied */
+ lea 7(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit9): /* Copy 9 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al /* byte 8 */
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx /* account for the 9 bytes just copied */
+ lea 9(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit10): /* Copy 10 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax /* bytes 8..9 */
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx /* account for the 10 bytes just copied */
+ lea 10(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit11): /* Copy 11 bytes from (%ecx) to (%edx) with overlapping 8- and 4-byte moves, set the return value and return.  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax /* bytes 7..10; byte 7 is simply written twice */
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx /* account for the 11 bytes just copied */
+ lea 11(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit12): /* Copy 12 bytes from (%ecx) to (%edx), set the return value and return.  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax /* bytes 8..11 */
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx /* account for the 12 bytes just copied */
+ lea 12(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit13): /* Copy 13 bytes from (%ecx) to (%edx) with two overlapping 8-byte moves, set the return value and return.  */
+ movlpd (%ecx), %xmm0 /* bytes 0..7 */
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0 /* bytes 5..12; bytes 5..7 are simply written twice */
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax /* stpcpy flavour: address of the last byte copied */
+# else
+ movl %edi, %eax /* otherwise return %edi (presumably the original destination -- confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx /* account for the 13 bytes just copied */
+ lea 13(%edx), %ecx /* first byte after the copy; lea leaves the flags from sub intact */
+ jnz L(StrncpyFillTailWithZero1) /* count not exhausted */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* carry is set only if that last byte is 0 */
+ sbb $-1, %eax /* %eax += 1 - carry: step past the last byte unless it is NUL */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit14): /* Copy exactly 14 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 */
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0 /* bytes 6-13; bytes 6-7 are copied twice */
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax /* result = address of the 14th byte just written */
+# else
+ movl %edi, %eax /* result comes from %edi (presumably the saved destination start - confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx /* %ebx -= 14 (presumably the remaining byte count - confirm) */
+ lea 14(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit15): /* Copy exactly 15 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 */
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0 /* bytes 7-14; byte 7 is copied twice */
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax /* result = address of the 15th byte just written */
+# else
+ movl %edi, %eax /* result comes from %edi (presumably the saved destination start - confirm) */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx /* %ebx -= 15 (presumably the remaining byte count - confirm) */
+ lea 15(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero1) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN1
+
+CFI_POP (%edi)
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(Fill0): /* L(FillN): store N bytes at (%ecx) from %dl/%dx/%edx/%xmm0, then return. The caller visible in this chunk zeroes %edx and %xmm0 first, making these zero-fills. */
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%ecx)
+ movb %dl, 2(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%ecx)
+ movb %dl, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%ecx)
+ movw %dx, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%ecx)
+ movl %edx, 3(%ecx) /* overlaps byte 3: two 4-byte stores cover 7 bytes */
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%ecx) /* 8 bytes from the low half of %xmm0 */
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%ecx)
+ movb %dl, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%ecx)
+ movw %dx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 7(%ecx) /* overlaps byte 7 */
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 5(%ecx) /* overlaps bytes 5-7 */
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 6(%ecx) /* overlaps bytes 6-7 */
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 7(%ecx) /* overlaps byte 7 */
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(StrncpyFillExit1): /* Entry for paths whose %ebx is 16 too low: add it back, then dispatch. */
+ lea 16(%ebx), %ebx
+L(FillFrom1To16Bytes): /* Jump to L(FillN) with N = %ebx; the paths visible in this chunk arrive with 0 <= %ebx <= 16. */
+ test %ebx, %ebx
+ jz L(Fill0)
+ cmp $16, %ebx
+ je L(Fill16)
+ cmp $8, %ebx
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %ebx /* here %ebx < 8 */
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %ebx /* here %ebx < 4 and nonzero */
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %ebx /* here %ebx > 12 */
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ CFI_PUSH(%edi)
+
+ .p2align 4
+L(StrncpyFillTailWithZero1): /* Entry used by the L(ExitN) paths, which still have %edi pushed. */
+ POP (%edi)
+L(StrncpyFillTailWithZero): /* Store %ebx zero bytes starting at (%ecx). */
+ pxor %xmm0, %xmm0 /* %xmm0 = 0: source for the 8- and 16-byte stores */
+ xor %edx, %edx /* %edx = 0: source for the 1-, 2- and 4-byte stores */
+ sub $16, %ebx
+ jbe L(StrncpyFillExit1) /* 16 bytes or fewer: the small dispatcher does it all */
+
+ movlpd %xmm0, (%ecx) /* clear the first 16 bytes with two unaligned 8-byte stores */
+ movlpd %xmm0, 8(%ecx)
+
+ lea 16(%ecx), %ecx
+
+ mov %ecx, %edx /* round %ecx down to a 16-byte boundary ... */
+ and $0xf, %edx
+ sub %edx, %ecx
+ add %edx, %ebx /* ... and add the bytes stepped back over to the count */
+ xor %edx, %edx /* %edx must be zero again for the L(FillN) stores */
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa): /* Clear 64 aligned bytes per iteration. */
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ movdqa %xmm0, 32(%ecx)
+ movdqa %xmm0, 48(%ecx)
+ lea 64(%ecx), %ecx
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64): /* Fewer than 64 bytes left; %ebx holds that count minus 64. */
+ add $32, %ebx /* %ebx = count - 32 */
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%ecx) /* at least 32 left: clear 32 */
+ movdqa %xmm0, 16(%ecx)
+ lea 32(%ecx), %ecx
+ sub $16, %ebx
+ jl L(StrncpyFillExit1) /* fewer than 16 left; the entry adds the 16 back */
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32): /* Fewer than 32 bytes left; %ebx holds that count minus 32. */
+ add $16, %ebx /* %ebx = count - 16 */
+ jl L(StrncpyFillExit1) /* fewer than 16 left; the entry adds the 16 back */
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+# endif
+
+ .p2align 4
+L(ExitTail1): /* Copy exactly 1 byte from (%ecx) to (%edx), then finish. */
+ movb (%ecx), %al
+ movb %al, (%edx)
+# ifdef USE_AS_STPCPY
+ lea (%edx), %eax /* result = address of the byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx /* %ebx -= 1 (presumably the remaining byte count - confirm) */
+ lea 1(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 1(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+# ifdef USE_AS_STPCPY
+ lea 2(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail4): /* Copy exactly 4 bytes from (%ecx) to (%edx), then finish. */
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 3(%edx), %eax /* result = address of the 4th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx /* %ebx -= 4 (presumably the remaining byte count - confirm) */
+ lea 4(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 4(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+# ifdef USE_AS_STPCPY
+ lea 5(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail7): /* Copy exactly 7 bytes from (%ecx) to (%edx), then finish. */
+ movl (%ecx), %eax /* bytes 0-3 */
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax /* bytes 3-6; byte 3 is copied twice */
+ movl %eax, 3(%edx)
+# ifdef USE_AS_STPCPY
+ lea 6(%edx), %eax /* result = address of the 7th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx /* %ebx -= 7 (presumably the remaining byte count - confirm) */
+ lea 7(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 8(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 9(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 10(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 11(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail13): /* Copy exactly 13 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 */
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0 /* bytes 5-12; bytes 5-7 are copied twice */
+ movlpd %xmm0, 5(%edx)
+# ifdef USE_AS_STPCPY
+ lea 12(%edx), %eax /* result = address of the 13th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx /* %ebx -= 13 (presumably the remaining byte count - confirm) */
+ lea 13(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+# ifdef USE_AS_STPCPY
+ lea 13(%edx), %eax
+# else
+ movl %edx, %eax
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail16): /* Copy exactly 16 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 */
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm0 /* bytes 8-15 */
+ movlpd %xmm0, 8(%edx)
+# ifdef USE_AS_STPCPY
+ lea 15(%edx), %eax /* result = address of the 16th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx /* %ebx -= 16 (presumably the remaining byte count - confirm) */
+ lea 16(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+# ifdef USE_AS_STRNCPY
+L(StrncpyLeaveCase2OrCase3): /* Dispatch on %eax: nonzero takes the Case2 path, zero the Case3 path. */
+ test %eax, %eax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3): /* %eax == 0: store %xmm4, %xmm5, %xmm6 in turn, leaving as soon as %ebx runs out; no compare per chunk here. */
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi /* %esi advances by 16 per stored chunk */
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx /* lea instead of sub: flags are not needed before the jmp */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2): /* %eax != 0: as above, but each chunk is compared against %xmm0 before the previous one is stored. */
+ pcmpeqb %xmm4, %xmm0 /* per-byte equality of %xmm4 with %xmm0 (presumably all-zero here, i.e. a NUL search - confirm) */
+ pmovmskb %xmm0, %eax /* %eax = bit mask of the matching byte positions */
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0 /* no byte matched, so the previous result left %xmm0 all-zero */
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase2)
+/* -------------------------------------------------- */
+L(StrncpyExit1Case2OrCase3): /* Write bytes 1..15 of %xmm1 to (%edx), keep the 16th byte already there, then dispatch on %eax. */
+ movaps (%edx), %xmm6 /* current 16 bytes at (%edx) */
+ psrldq $15, %xmm6 /* keep only byte 15, moved down to byte 0 */
+ mov $15, %esi /* %esi = 15, the number of new bytes stored below */
+ palignr $1, %xmm1, %xmm6 /* %xmm6 = bytes 1..15 of %xmm1, then the kept byte */
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2) /* %eax nonzero: Case2 */
+ jmp L(CopyFrom1To16BytesCase3) /* %eax zero: Case3 */
+
+L(StrncpyExit2Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $14, %xmm6
+ mov $14, %esi
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit3Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $13, %xmm6
+ mov $13, %esi
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit4Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $12, %xmm6
+ mov $12, %esi
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit5Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $11, %xmm6
+ mov $11, %esi
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit6Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $10, %xmm6
+ mov $10, %esi
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit7Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $9, %xmm6
+ mov $9, %esi
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit8Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $8, %xmm6
+ mov $8, %esi
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit9Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $7, %xmm6
+ mov $7, %esi
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit10Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $6, %xmm6
+ mov $6, %esi
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit11Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $5, %xmm6
+ mov $5, %esi
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit12Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $4, %xmm6
+ mov $4, %esi
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit13Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $3, %xmm6
+ mov $3, %esi
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit14Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $2, %xmm6
+ mov $2, %esi
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyExit15Case2OrCase3):
+ movaps (%edx), %xmm6
+ psrldq $1, %xmm6
+ mov $1, %esi
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%edx)
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave1): /* palignr-shift-1 variant: up to four more 16-byte stores at (%edx), leaving for L(StrncpyExit1) as soon as %ebx runs out. */
+ movaps %xmm2, %xmm3 /* keep an unshifted copy of %xmm2 */
+ add $48, %ebx
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2 /* %xmm2 = bytes 1..15 of %xmm1, then byte 0 of %xmm2 */
+ movaps %xmm3, %xmm1 /* the unshifted copy is the low operand of the next palignr */
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2 /* movaps needs a 16-byte-aligned address, so %ecx+31 must be aligned here */
+ lea 16(%esi), %esi /* %esi advances by 16 per stored chunk */
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 31+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx) /* %xmm4/%xmm5 are stored as-is (presumably shifted before this label - confirm at the jump site) */
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx /* lea instead of sub: no flags needed on the fall-through */
+
+L(StrncpyExit1): /* Write bytes 1..15 of %xmm1 at (%edx,%esi), keeping the 16th byte already there. */
+ movaps (%edx, %esi), %xmm6
+ psrldq $15, %xmm6 /* keep only byte 15 of the old contents, moved down to byte 0 */
+ palignr $1, %xmm1, %xmm6 /* bytes 1..15 of %xmm1, then the kept byte */
+ movaps %xmm6, (%edx, %esi)
+ lea 15(%esi), %esi /* %esi += 15, the number of new bytes just stored */
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 30+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit2):
+ movaps (%edx, %esi), %xmm6
+ psrldq $14, %xmm6
+ palignr $2, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 14(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 29+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit3):
+ movaps (%edx, %esi), %xmm6
+ psrldq $13, %xmm6
+ palignr $3, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 13(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 28+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit4):
+ movaps (%edx, %esi), %xmm6
+ psrldq $12, %xmm6
+ palignr $4, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 12(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 27+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit5):
+ movaps (%edx, %esi), %xmm6
+ psrldq $11, %xmm6
+ palignr $5, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 11(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 26+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit6):
+ movaps (%edx, %esi), %xmm6
+ psrldq $10, %xmm6
+ palignr $6, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 10(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 25+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit7):
+ movaps (%edx, %esi), %xmm6
+ psrldq $9, %xmm6
+ palignr $7, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 9(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 24+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit8):
+ movaps (%edx, %esi), %xmm6
+ psrldq $8, %xmm6
+ palignr $8, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 8(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 23+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit9):
+ movaps (%edx, %esi), %xmm6
+ psrldq $7, %xmm6
+ palignr $9, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 7(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 22+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit10):
+ movaps (%edx, %esi), %xmm6
+ psrldq $6, %xmm6
+ palignr $10, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 6(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 21+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit11):
+ movaps (%edx, %esi), %xmm6
+ psrldq $5, %xmm6
+ palignr $11, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 5(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 20+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit12):
+ movaps (%edx, %esi), %xmm6
+ psrldq $4, %xmm6
+ palignr $12, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 4(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 19+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit13):
+ movaps (%edx, %esi), %xmm6
+ psrldq $3, %xmm6
+ palignr $13, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 3(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 18+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit14):
+ movaps (%edx, %esi), %xmm6
+ psrldq $2, %xmm6
+ palignr $14, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 2(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ lea 16(%esi), %esi
+ movaps %xmm2, %xmm3
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, 16(%edx)
+ movaps 17+16(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm2, %xmm1
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+
+L(StrncpyExit15):
+ movaps (%edx, %esi), %xmm6
+ psrldq $1, %xmm6
+ palignr $15, %xmm1, %xmm6
+ movaps %xmm6, (%edx, %esi)
+ lea 1(%esi), %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(ExitTail0): /* Nothing to copy: return %edx unchanged. */
+ movl %edx, %eax
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15Bytes): /* Choose L(ExitTailN) for N = 9..15: the first N where %ebx == N or source byte N-1 is NUL. Checks start at 9, so presumably bytes 0-7 are known non-NUL and %ebx > 8 on entry - confirm at the jump sites. */
+ cmp $9, %ebx
+ je L(ExitTail9)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $10, %ebx
+ je L(ExitTail10)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $11, %ebx
+ je L(ExitTail11)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmp $12, %ebx
+ je L(ExitTail12)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmp $13, %ebx
+ je L(ExitTail13)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $14, %ebx
+ je L(ExitTail14)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14) /* otherwise execution falls through to the next label in the file, L(ExitTail15) */
+# endif
+
+ .p2align 4
+L(ExitTail15): /* Copy exactly 15 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 */
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0 /* bytes 7-14; byte 7 is copied twice */
+ movlpd %xmm0, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax /* result = address of the 15th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx /* %ebx -= 15 (presumably the remaining byte count - confirm) */
+ lea 15(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+# ifdef USE_AS_STRNCPY
+ .p2align 4
+L(StrncpyExit8Bytes): /* Choose L(ExitTailN) for N = 1..8: the first N where %ebx == N or source byte N-1 is NUL. */
+ cmp $1, %ebx
+ je L(ExitTail1)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $2, %ebx
+ je L(ExitTail2)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $3, %ebx
+ je L(ExitTail3)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmp $4, %ebx
+ je L(ExitTail4)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmp $5, %ebx
+ je L(ExitTail5)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $6, %ebx
+ je L(ExitTail6)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $7, %ebx
+ je L(ExitTail7)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7) /* otherwise execution falls through to the next label in the file, L(ExitTail8) */
+# endif
+ .p2align 4
+L(ExitTail8): /* Copy exactly 8 bytes from (%ecx) to (%edx), then finish. */
+ movlpd (%ecx), %xmm0 /* bytes 0-7 via the low half of %xmm0 */
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax /* result = address of the 8th byte just written */
+# else
+ movl %edx, %eax /* result = %edx, the address the copy started at */
+# endif
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx /* %ebx -= 8 (presumably the remaining byte count - confirm) */
+ lea 8(%edx), %ecx /* %ecx = first byte past the copy; lea keeps the flags set by sub */
+ jnz L(StrncpyFillTailWithZero) /* %ebx still nonzero: go zero-fill from %ecx */
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax) /* CF = 1 exactly when the byte at (%eax) is 0 */
+ sbb $-1, %eax /* %eax += 1 - CF */
+# endif
+# endif
+ RETURN
+
+END (STRCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strcpy.S b/sysdeps/i386/i686/multiarch/strcpy.S
new file mode 100644
index 0000000..d025a4f
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcpy.S
@@ -0,0 +1,154 @@
+/* Multiple versions of strcpy
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __stpncpy_ssse3
+# define STRCPY_SSE2 __stpncpy_sse2
+# define STRCPY_IA32 __stpncpy_ia32
+# define __GI_STRCPY __GI_stpncpy
+# define __GI___STRCPY __GI___stpncpy
+# else
+# define STRCPY_SSSE3 __stpcpy_ssse3
+# define STRCPY_SSE2 __stpcpy_sse2
+# define STRCPY_IA32 __stpcpy_ia32
+# define __GI_STRCPY __GI_stpcpy
+# define __GI___STRCPY __GI___stpcpy
+# endif
+#else
+# ifdef USE_AS_STRNCPY
+# define STRCPY_SSSE3 __strncpy_ssse3
+# define STRCPY_SSE2 __strncpy_sse2
+# define STRCPY_IA32 __strncpy_ia32
+# define __GI_STRCPY __GI_strncpy
+# else
+# define STRCPY_SSSE3 __strcpy_ssse3
+# define STRCPY_SSE2 __strcpy_sse2
+# define STRCPY_IA32 __strcpy_ia32
+# define __GI_STRCPY __GI_strcpy
+# endif
+#endif
+
+
+/* Define multiple versions only for the definition in libc. Don't
+ define multiple versions for strncpy in the static library since we
+ need strncpy before the initialization has happened. */
+#ifndef NOT_IN_libc
+
+# ifdef SHARED
+ .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits /* Helper: returns its own return address in %ebx. */
+ .globl __i686.get_pc_thunk.bx
+ .hidden __i686.get_pc_thunk.bx
+ .p2align 4
+ .type __i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+ movl (%esp), %ebx /* (%esp) holds the caller's return address */
+ ret
+
+ .text
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function /* IFUNC resolver: leaves the address of the chosen implementation in %eax */
+ pushl %ebx
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (ebx, 0)
+ call __i686.get_pc_thunk.bx /* %ebx = address of the next instruction */
+ addl $_GLOBAL_OFFSET_TABLE_, %ebx /* %ebx = GOT base for the @GOTOFF references below */
+ cmpl $0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
+ jne 1f /* field at KIND_OFFSET already nonzero: skip the init call */
+ call __init_cpu_features
+1: leal STRCPY_IA32@GOTOFF(%ebx), %eax /* default choice: the IA32 version */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* SSE2 bit clear: keep the IA32 version */
+ leal STRCPY_SSE2@GOTOFF(%ebx), %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+ jnz 2f /* Fast_Unaligned_Load bit set: keep the SSE2 version */
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+ jz 2f /* SSSE3 bit clear: keep the SSE2 version */
+ leal STRCPY_SSSE3@GOTOFF(%ebx), %eax
+2: popl %ebx
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (ebx)
+ ret
+END(STRCPY)
+# else
+
+ENTRY(STRCPY)
+ .type STRCPY, @gnu_indirect_function /* IFUNC resolver, absolute-address variant: leaves the chosen implementation in %eax */
+ cmpl $0, KIND_OFFSET+__cpu_features
+ jne 1f /* field at KIND_OFFSET already nonzero: skip the init call */
+ call __init_cpu_features
+1: leal STRCPY_IA32, %eax /* default choice: the IA32 version */
+ testl $bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+ jz 2f /* SSE2 bit clear: keep the IA32 version */
+ leal STRCPY_SSE2, %eax
+ testl $bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+ jnz 2f /* Fast_Unaligned_Load bit set: keep the SSE2 version */
+ testl $bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+ jz 2f /* SSSE3 bit clear: keep the SSE2 version */
+ leal STRCPY_SSSE3, %eax
+2: ret
+END(STRCPY)
+
+# endif
+
+# undef ENTRY /* From here on ENTRY/END ignore their argument and always define the symbol STRCPY_IA32. */
+# define ENTRY(name) \
+ .type STRCPY_IA32, @function; \
+ .align 16; \
+ STRCPY_IA32: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
+ The speedup we get from using SSSE3 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32
+
+# endif
+#endif
+
+#ifdef USE_AS_STPCPY
+# ifdef USE_AS_STRNCPY
+# include "../../stpncpy.S"
+# else
+# include "../../i586/stpcpy.S"
+# endif
+#else
+# ifndef USE_AS_STRNCPY
+# include "../../i586/strcpy.S"
+# endif
+#endif
diff --git a/sysdeps/i386/i686/multiarch/strncpy-c.c b/sysdeps/i386/i686/multiarch/strncpy-c.c
new file mode 100644
index 0000000..201e3f9
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncpy-c.c
@@ -0,0 +1,8 @@
+#define STRNCPY __strncpy_ia32
+#ifdef SHARED
+# undef libc_hidden_builtin_def /* In shared builds, replace the hidden-def macro with a __hidden_ver1 use naming __GI_strncpy and __strncpy_ia32.  */
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32);
+#endif
+
+#include "string/strncpy.c" /* Compile string/strncpy.c with STRNCPY defined as __strncpy_ia32.  */
diff --git a/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/sysdeps/i386/i686/multiarch/strncpy-sse2.S
new file mode 100644
index 0000000..bdd9923
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_sse2
+#include "strcpy-sse2.S" /* Assemble strcpy-sse2.S with USE_AS_STRNCPY set and STRCPY defined as __strncpy_sse2.  */
diff --git a/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
new file mode 100644
index 0000000..bf82ee4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY __strncpy_ssse3
+#include "strcpy-ssse3.S" /* Assemble strcpy-ssse3.S with USE_AS_STRNCPY set and STRCPY defined as __strncpy_ssse3.  */
diff --git a/sysdeps/i386/i686/multiarch/strncpy.S b/sysdeps/i386/i686/multiarch/strncpy.S
new file mode 100644
index 0000000..30a5bd2
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strncpy.S
@@ -0,0 +1,3 @@
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "strcpy.S" /* Assemble strcpy.S with USE_AS_STRNCPY set and STRCPY defined as strncpy.  */
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 809d105..81b2378 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -97,13 +97,18 @@ __init_cpu_features (void)
case 0x2c:
case 0x2e:
case 0x2f:
- /* Rep string instructions and copy backward are fast on
- Intel Core i3, i5 and i7. */
+ /* Rep string instructions, copy backward and unaligned loads
+ are fast on Intel Core i3, i5 and i7. */
#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
+#if index_Fast_Rep_String != index_Fast_Unaligned_Load
+# error index_Fast_Rep_String != index_Fast_Unaligned_Load
+#endif
__cpu_features.feature[index_Fast_Rep_String]
- |= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
+ |= (bit_Fast_Rep_String
+ | bit_Fast_Copy_Backward
+ | bit_Fast_Unaligned_Load);
break;
}
}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 6e409b8..addf5f3 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -20,6 +20,7 @@
#define bit_Fast_Copy_Backward (1 << 1)
#define bit_Slow_BSF (1 << 2)
#define bit_Prefer_SSE_for_memop (1 << 3)
+#define bit_Fast_Unaligned_Load (1 << 4)
#ifdef __ASSEMBLER__
@@ -39,6 +40,7 @@
# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
# define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE
# define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -112,6 +114,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_Fast_Copy_Backward FEATURE_INDEX_1
# define index_Slow_BSF FEATURE_INDEX_1
# define index_Prefer_SSE_for_memop FEATURE_INDEX_1
+# define index_Fast_Unaligned_Load FEATURE_INDEX_1
#define HAS_ARCH_FEATURE(idx, bit) \
((__get_cpu_features ()->feature[idx] & (bit)) != 0)
@@ -128,4 +131,7 @@ extern const struct cpu_features *__get_cpu_features (void)
#define HAS_PREFER_SSE_FOR_MEMOP \
HAS_ARCH_FEATURE (index_Prefer_SSE_for_memop, bit_Prefer_SSE_for_memop)
+#define HAS_FAST_UNALIGNED_LOAD \
+ HAS_ARCH_FEATURE (index_Fast_Unaligned_Load, bit_Fast_Unaligned_Load)
+
#endif /* __ASSEMBLER__ */
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=07f494a027b3adea1f3cd0cd4ca7c10949cdc476
commit 07f494a027b3adea1f3cd0cd4ca7c10949cdc476
Author: Marek Polacek <mpolacek@redhat.com>
Date: Fri Jun 24 02:57:27 2011 -0400
nss_db: Quash read implicit declaration warning
The function `internal_setent' calls read, but no declaration of read is
in scope. Fixed by including <unistd.h>.
diff --git a/ChangeLog b/ChangeLog
index ad7303f..b4d6496 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-06-23 Marek Polacek <mpolacek@redhat.com>
+
+ * nss/nss_db/db-open.c: Include <unistd.h> for read declaration.
+
2011-06-22 Ulrich Drepper <drepper@gmail.com>
[BZ #12907]
diff --git a/nss/nss_db/db-open.c b/nss/nss_db/db-open.c
index 36ce494..5a805cf 100644
--- a/nss/nss_db/db-open.c
+++ b/nss/nss_db/db-open.c
@@ -22,6 +22,7 @@
#include <dlfcn.h>
#include <stdlib.h>
#include <string.h>
+#include <unistd.h>
#include <sys/mman.h>
#include <not-cancel.h>
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 53 +
NEWS | 10 +-
nss/nss_db/db-open.c | 1 +
sysdeps/i386/i686/multiarch/Makefile | 4 +-
sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3 +
sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3 +
sysdeps/{x86_64 => i386/i686}/multiarch/stpcpy.S | 0
sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4 +
sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4 +
sysdeps/{x86_64 => i386/i686}/multiarch/stpncpy.S | 0
sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2251 ++++++++++++
sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 4090 +++++++++++++++++++++
sysdeps/i386/i686/multiarch/strcpy.S | 154 +
sysdeps/i386/i686/multiarch/strncpy-c.c | 8 +
sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3 +
sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3 +
sysdeps/i386/i686/multiarch/strncpy.S | 3 +
sysdeps/unix/sysv/linux/Makefile | 1 +
sysdeps/wordsize-64/tst-writev.c | 10 +-
sysdeps/x86_64/multiarch/Makefile | 7 +-
sysdeps/x86_64/multiarch/init-arch.c | 11 +-
sysdeps/x86_64/multiarch/init-arch.h | 6 +
sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 3 +
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 +
sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S | 4 +
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 +
sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 1718 +++++++++
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3721 +++++++++++++++++++
sysdeps/x86_64/multiarch/strcpy.S | 1860 +---------
sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S | 3 +
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 +
31 files changed, 12102 insertions(+), 1846 deletions(-)
create mode 100644 sysdeps/i386/i686/multiarch/stpcpy-sse2.S
create mode 100644 sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
copy sysdeps/{x86_64 => i386/i686}/multiarch/stpcpy.S (100%)
create mode 100644 sysdeps/i386/i686/multiarch/stpncpy-sse2.S
create mode 100644 sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
copy sysdeps/{x86_64 => i386/i686}/multiarch/stpncpy.S (100%)
create mode 100644 sysdeps/i386/i686/multiarch/strcpy-sse2.S
create mode 100644 sysdeps/i386/i686/multiarch/strcpy-ssse3.S
create mode 100644 sysdeps/i386/i686/multiarch/strcpy.S
create mode 100644 sysdeps/i386/i686/multiarch/strncpy-c.c
create mode 100644 sysdeps/i386/i686/multiarch/strncpy-sse2.S
create mode 100644 sysdeps/i386/i686/multiarch/strncpy-ssse3.S
create mode 100644 sysdeps/i386/i686/multiarch/strncpy.S
create mode 100644 sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S
create mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
hooks/post-receive
--
GNU C Library master sources