This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.14-607-gbbe315e


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master, has been updated
       via  bbe315ea364e86166bb985e2e605af029482a124 (commit)
       via  15db4de19dc0043c25ff6a205bfbc25a180b1c48 (commit)
      from  2b2596b1e94d9d51bd8febe81b759fa45a62e3cb (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bbe315ea364e86166bb985e2e605af029482a124

commit bbe315ea364e86166bb985e2e605af029482a124
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date:   Fri Dec 23 12:02:53 2011 -0500

    CL

diff --git a/ChangeLog b/ChangeLog
index 2eed115..37e70e7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,8 +1,8 @@
 2011-12-23  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
 
 	[BZ #13540]
-	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix bug.
-	Fix overrun in destination buffer.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix overrun in
+	destination buffer.
 	* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise.
 
 2011-12-23  Marek Polacek  <polacek@redhat.com>

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=15db4de19dc0043c25ff6a205bfbc25a180b1c48

commit 15db4de19dc0043c25ff6a205bfbc25a180b1c48
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date:   Fri Dec 23 12:02:15 2011 -0500

    Fix overrun in destination buffer

diff --git a/ChangeLog b/ChangeLog
index a883f93..2eed115 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2011-12-23  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
+
+	[BZ #13540]
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Fix bug.
+	Fix overrun in destination buffer.
+	* sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise.
+
 2011-12-23  Marek Polacek  <polacek@redhat.com>
 
 	* elf/dl-addr.c (determine_info): Add inline keyword.
@@ -13,6 +20,7 @@
 
 2011-12-23  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
 
+	[BZ #13540]
 	* sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: Fix wrong copying
 	processing for last bytes.
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index c4ec54c..b104765 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -29,6 +29,7 @@
 
 	.section .text.ssse3,"ax",@progbits
 ENTRY (STRCPY)
+
 	mov	%rsi, %rcx
 #  ifdef USE_AS_STRNCPY
 	mov	%rdx, %r8
@@ -39,7 +40,7 @@ ENTRY (STRCPY)
 	jz	L(Exit0)
 	cmp	$8, %r8
 	jbe	L(StrncpyExit8Bytes)
-#  endif
+# endif
 	cmpb	$0, (%rcx)
 	jz	L(Exit1)
 	cmpb	$0, 1(%rcx)
@@ -56,10 +57,10 @@ ENTRY (STRCPY)
 	jz	L(Exit7)
 	cmpb	$0, 7(%rcx)
 	jz	L(Exit8)
-#  ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	jb	L(StrncpyExit15Bytes)
-#  endif
+# endif
 	cmpb	$0, 8(%rcx)
 	jz	L(Exit9)
 	cmpb	$0, 9(%rcx)
@@ -74,10 +75,10 @@ ENTRY (STRCPY)
 	jz	L(Exit14)
 	cmpb	$0, 14(%rcx)
 	jz	L(Exit15)
-#  ifdef USE_AS_STRNCPY
+# ifdef USE_AS_STRNCPY
 	cmp	$16, %r8
 	je	L(Exit16)
-#  endif
+# endif
 	cmpb	$0, 15(%rcx)
 	jz	L(Exit16)
 # endif
@@ -87,25 +88,15 @@ ENTRY (STRCPY)
 	sub	$16, %r8
 	and	$0xf, %rsi
 
-/* add 16 bytes rcx_shift to r8 */
+/* add 16 bytes rcx_offset to r8 */
+
 	add	%rsi, %r8
 # endif
 	lea	16(%rcx), %rsi
-/* Now:
-	rsi	= alignment_16(rcx) + rcx_shift + 16;
-	rcx_shift = rcx - alignment_16(rcx)
-*/
 	and	$-16, %rsi
-/* Now:
-	rsi	= alignment_16(rcx) + 16
-*/
 	pxor	%xmm0, %xmm0
 	mov	(%rcx), %r9
 	mov	%r9, (%rdx)
-/*
-	look	if there is zero symbol in next 16 bytes of string
-	from	rsi to rsi + 15 and form mask in xmm0
-*/
 	pcmpeqb	(%rsi), %xmm0
 	mov	8(%rcx), %r9
 	mov	%r9, 8(%rdx)
@@ -115,10 +106,6 @@ ENTRY (STRCPY)
 	pmovmskb %xmm0, %rax
 	sub	%rcx, %rsi
 
-/* rsi = 16 - rcx_shift */
-
-/* rax = 0: there isn't end of string from position rsi to rsi+15 */
-
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(CopyFrom1To16BytesCase2OrCase3)
@@ -128,17 +115,9 @@ ENTRY (STRCPY)
 
 	mov	%rdx, %rax
 	lea	16(%rdx), %rdx
-/* Now:
-	rdx	= rdx + 16 = alignment_16(rdx) + rdx_shift + 16
-*/
 	and	$-16, %rdx
-
-/* Now: rdx = alignment_16(rdx) + 16 */
-
 	sub	%rdx, %rax
 
-/* Now: rax = rdx_shift - 16 */
-
 # ifdef USE_AS_STRNCPY
 	add	%rax, %rsi
 	lea	-1(%rsi), %rsi
@@ -150,22 +129,11 @@ ENTRY (STRCPY)
 L(ContinueCopy):
 # endif
 	sub	%rax, %rcx
-/* Now:
-	case	rcx_shift >= rdx_shift:
-	rcx	= alignment_16(rcx) + (rcx_shift  - rdx_shift) + 16
-	case	rcx_shift < rdx_shift:
-	rcx	= alignment_16(rcx) + (16 + rcx_shift  - rdx_shift)
-*/
 	mov	%rcx, %rax
 	and	$0xf, %rax
-/* Now:
-	case	rcx_shift >= rdx_shift: rax = rcx_shift  - rdx_shift
-	case	rcx_shift < rdx_shift: rax = (16 + rcx_shift  - rdx_shift)
-	rax	can be 0, 1,	..., 15
-*/
 	mov	$0, %rsi
 
-/* case: rcx_shift == rdx_shift */
+/* case: rcx_offset == rdx_offset */
 
 	jz	L(Align16Both)
 
@@ -282,10 +250,11 @@ L(Align16Both):
 	sub	%rcx, %rax
 	sub	%rax, %rdx
 # ifdef USE_AS_STRNCPY
-	lea	48+64(%r8, %rax), %r8
+	lea	112(%r8, %rax), %r8
 # endif
 	mov	$-0x40, %rsi
 
+	.p2align 4
 L(Aligned64Loop):
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
@@ -366,7 +335,6 @@ L(Shl1Start):
 	jnz	L(Shl1LoopExit)
 
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 
@@ -374,7 +342,7 @@ L(Shl1Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit1Case2OrCase3)
@@ -382,10 +350,9 @@ L(Shl1Start):
 	test	%rax, %rax
 	jnz	L(Shl1LoopExit)
 
-	palignr	$1, %xmm1, %xmm2
+	palignr	$1, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -400,7 +367,6 @@ L(Shl1Start):
 	jnz	L(Shl1LoopExit)
 
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 
@@ -408,7 +374,6 @@ L(Shl1Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit1Case2OrCase3)
@@ -416,8 +381,7 @@ L(Shl1Start):
 	test	%rax, %rax
 	jnz	L(Shl1LoopExit)
 
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$1, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	31(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -432,6 +396,8 @@ L(Shl1Start):
 # endif
 	movaps	-1(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl1LoopStart):
 	movaps	15(%rcx), %xmm2
 	movaps	31(%rcx), %xmm3
@@ -465,11 +431,9 @@ L(Shl1LoopStart):
 	jmp	L(Shl1LoopStart)
 
 L(Shl1LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$15, %xmm6
+	movdqu	-1(%rcx), %xmm1
 	mov	$15, %rsi
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -488,7 +452,6 @@ L(Shl2Start):
 	jnz	L(Shl2LoopExit)
 
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 
@@ -496,7 +459,7 @@ L(Shl2Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit2Case2OrCase3)
@@ -504,10 +467,9 @@ L(Shl2Start):
 	test	%rax, %rax
 	jnz	L(Shl2LoopExit)
 
-	palignr	$2, %xmm1, %xmm2
+	palignr	$2, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -522,7 +484,6 @@ L(Shl2Start):
 	jnz	L(Shl2LoopExit)
 
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 
@@ -530,7 +491,6 @@ L(Shl2Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit2Case2OrCase3)
@@ -538,8 +498,7 @@ L(Shl2Start):
 	test	%rax, %rax
 	jnz	L(Shl2LoopExit)
 
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$2, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	30(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -554,6 +513,8 @@ L(Shl2Start):
 # endif
 	movaps	-2(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl2LoopStart):
 	movaps	14(%rcx), %xmm2
 	movaps	30(%rcx), %xmm3
@@ -587,11 +548,9 @@ L(Shl2LoopStart):
 	jmp	L(Shl2LoopStart)
 
 L(Shl2LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$14, %xmm6
+	movdqu	-2(%rcx), %xmm1
 	mov	$14, %rsi
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -610,7 +569,6 @@ L(Shl3Start):
 	jnz	L(Shl3LoopExit)
 
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 
@@ -618,7 +576,7 @@ L(Shl3Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit3Case2OrCase3)
@@ -626,10 +584,9 @@ L(Shl3Start):
 	test	%rax, %rax
 	jnz	L(Shl3LoopExit)
 
-	palignr	$3, %xmm1, %xmm2
+	palignr	$3, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -644,7 +601,6 @@ L(Shl3Start):
 	jnz	L(Shl3LoopExit)
 
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 
@@ -652,7 +608,6 @@ L(Shl3Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit3Case2OrCase3)
@@ -660,8 +615,7 @@ L(Shl3Start):
 	test	%rax, %rax
 	jnz	L(Shl3LoopExit)
 
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$3, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	29(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -676,6 +630,8 @@ L(Shl3Start):
 # endif
 	movaps	-3(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl3LoopStart):
 	movaps	13(%rcx), %xmm2
 	movaps	29(%rcx), %xmm3
@@ -709,11 +665,9 @@ L(Shl3LoopStart):
 	jmp	L(Shl3LoopStart)
 
 L(Shl3LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$13, %xmm6
+	movdqu	-3(%rcx), %xmm1
 	mov	$13, %rsi
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -3(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -732,7 +686,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -740,7 +693,7 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit4Case2OrCase3)
@@ -748,10 +701,9 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -766,7 +718,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -774,7 +725,6 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit4Case2OrCase3)
@@ -782,8 +732,7 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -798,6 +747,8 @@ L(Shl4Start):
 # endif
 	movaps	-4(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl4LoopStart):
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
@@ -831,11 +782,9 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm1
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -4(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -854,7 +803,6 @@ L(Shl5Start):
 	jnz	L(Shl5LoopExit)
 
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 
@@ -862,7 +810,7 @@ L(Shl5Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit5Case2OrCase3)
@@ -870,10 +818,9 @@ L(Shl5Start):
 	test	%rax, %rax
 	jnz	L(Shl5LoopExit)
 
-	palignr	$5, %xmm1, %xmm2
+	palignr	$5, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -888,7 +835,6 @@ L(Shl5Start):
 	jnz	L(Shl5LoopExit)
 
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 
@@ -896,7 +842,6 @@ L(Shl5Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit5Case2OrCase3)
@@ -904,8 +849,7 @@ L(Shl5Start):
 	test	%rax, %rax
 	jnz	L(Shl5LoopExit)
 
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$5, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	27(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -920,6 +864,8 @@ L(Shl5Start):
 # endif
 	movaps	-5(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl5LoopStart):
 	movaps	11(%rcx), %xmm2
 	movaps	27(%rcx), %xmm3
@@ -953,11 +899,9 @@ L(Shl5LoopStart):
 	jmp	L(Shl5LoopStart)
 
 L(Shl5LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$11, %xmm6
+	movdqu	-5(%rcx), %xmm1
 	mov	$11, %rsi
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -5(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -976,7 +920,6 @@ L(Shl6Start):
 	jnz	L(Shl6LoopExit)
 
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 
@@ -984,7 +927,7 @@ L(Shl6Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit6Case2OrCase3)
@@ -992,10 +935,9 @@ L(Shl6Start):
 	test	%rax, %rax
 	jnz	L(Shl6LoopExit)
 
-	palignr	$6, %xmm1, %xmm2
+	palignr	$6, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1010,7 +952,6 @@ L(Shl6Start):
 	jnz	L(Shl6LoopExit)
 
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 
@@ -1018,7 +959,6 @@ L(Shl6Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit6Case2OrCase3)
@@ -1026,8 +966,7 @@ L(Shl6Start):
 	test	%rax, %rax
 	jnz	L(Shl6LoopExit)
 
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$6, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	26(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1042,6 +981,8 @@ L(Shl6Start):
 # endif
 	movaps	-6(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl6LoopStart):
 	movaps	10(%rcx), %xmm2
 	movaps	26(%rcx), %xmm3
@@ -1075,11 +1016,11 @@ L(Shl6LoopStart):
 	jmp	L(Shl6LoopStart)
 
 L(Shl6LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$10, %xmm6
+	mov	(%rcx), %r9
+	mov	6(%rcx), %esi
+	mov	%r9, (%rdx)
+	mov	%esi, 6(%rdx)
 	mov	$10, %rsi
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1098,7 +1039,6 @@ L(Shl7Start):
 	jnz	L(Shl7LoopExit)
 
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 
@@ -1106,7 +1046,7 @@ L(Shl7Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit7Case2OrCase3)
@@ -1114,10 +1054,9 @@ L(Shl7Start):
 	test	%rax, %rax
 	jnz	L(Shl7LoopExit)
 
-	palignr	$7, %xmm1, %xmm2
+	palignr	$7, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1132,7 +1071,6 @@ L(Shl7Start):
 	jnz	L(Shl7LoopExit)
 
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 
@@ -1140,7 +1078,6 @@ L(Shl7Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit7Case2OrCase3)
@@ -1148,8 +1085,7 @@ L(Shl7Start):
 	test	%rax, %rax
 	jnz	L(Shl7LoopExit)
 
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$7, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	25(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1164,6 +1100,8 @@ L(Shl7Start):
 # endif
 	movaps	-7(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl7LoopStart):
 	movaps	9(%rcx), %xmm2
 	movaps	25(%rcx), %xmm3
@@ -1197,11 +1135,11 @@ L(Shl7LoopStart):
 	jmp	L(Shl7LoopStart)
 
 L(Shl7LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$9, %xmm6
+	mov	(%rcx), %r9
+	mov	5(%rcx), %esi
+	mov	%r9, (%rdx)
+	mov	%esi, 5(%rdx)
 	mov	$9, %rsi
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1220,7 +1158,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -1228,7 +1165,7 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit8Case2OrCase3)
@@ -1236,10 +1173,9 @@ L(Shl8Start):
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1254,7 +1190,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -1262,7 +1197,6 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit8Case2OrCase3)
@@ -1270,8 +1204,7 @@ L(Shl8Start):
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1286,6 +1219,8 @@ L(Shl8Start):
 # endif
 	movaps	-8(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl8LoopStart):
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
@@ -1319,11 +1254,9 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1342,7 +1275,6 @@ L(Shl9Start):
 	jnz	L(Shl9LoopExit)
 
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 
@@ -1350,7 +1282,7 @@ L(Shl9Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit9Case2OrCase3)
@@ -1358,10 +1290,9 @@ L(Shl9Start):
 	test	%rax, %rax
 	jnz	L(Shl9LoopExit)
 
-	palignr	$9, %xmm1, %xmm2
+	palignr	$9, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1376,7 +1307,6 @@ L(Shl9Start):
 	jnz	L(Shl9LoopExit)
 
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 
@@ -1384,7 +1314,6 @@ L(Shl9Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit9Case2OrCase3)
@@ -1392,8 +1321,7 @@ L(Shl9Start):
 	test	%rax, %rax
 	jnz	L(Shl9LoopExit)
 
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$9, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	23(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1408,6 +1336,8 @@ L(Shl9Start):
 # endif
 	movaps	-9(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl9LoopStart):
 	movaps	7(%rcx), %xmm2
 	movaps	23(%rcx), %xmm3
@@ -1441,11 +1371,9 @@ L(Shl9LoopStart):
 	jmp	L(Shl9LoopStart)
 
 L(Shl9LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$7, %xmm6
+	mov	-1(%rcx), %r9
 	mov	$7, %rsi
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1464,7 +1392,6 @@ L(Shl10Start):
 	jnz	L(Shl10LoopExit)
 
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 
@@ -1472,7 +1399,7 @@ L(Shl10Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit10Case2OrCase3)
@@ -1480,10 +1407,9 @@ L(Shl10Start):
 	test	%rax, %rax
 	jnz	L(Shl10LoopExit)
 
-	palignr	$10, %xmm1, %xmm2
+	palignr	$10, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1498,7 +1424,6 @@ L(Shl10Start):
 	jnz	L(Shl10LoopExit)
 
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 
@@ -1506,7 +1431,6 @@ L(Shl10Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit10Case2OrCase3)
@@ -1514,8 +1438,7 @@ L(Shl10Start):
 	test	%rax, %rax
 	jnz	L(Shl10LoopExit)
 
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$10, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	22(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1530,6 +1453,8 @@ L(Shl10Start):
 # endif
 	movaps	-10(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl10LoopStart):
 	movaps	6(%rcx), %xmm2
 	movaps	22(%rcx), %xmm3
@@ -1563,11 +1488,9 @@ L(Shl10LoopStart):
 	jmp	L(Shl10LoopStart)
 
 L(Shl10LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$6, %xmm6
+	mov	-2(%rcx), %r9
 	mov	$6, %rsi
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1586,7 +1509,6 @@ L(Shl11Start):
 	jnz	L(Shl11LoopExit)
 
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 
@@ -1594,7 +1516,7 @@ L(Shl11Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit11Case2OrCase3)
@@ -1602,10 +1524,9 @@ L(Shl11Start):
 	test	%rax, %rax
 	jnz	L(Shl11LoopExit)
 
-	palignr	$11, %xmm1, %xmm2
+	palignr	$11, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1620,7 +1541,6 @@ L(Shl11Start):
 	jnz	L(Shl11LoopExit)
 
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 
@@ -1628,7 +1548,6 @@ L(Shl11Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit11Case2OrCase3)
@@ -1636,8 +1555,7 @@ L(Shl11Start):
 	test	%rax, %rax
 	jnz	L(Shl11LoopExit)
 
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$11, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	21(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1652,6 +1570,8 @@ L(Shl11Start):
 # endif
 	movaps	-11(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl11LoopStart):
 	movaps	5(%rcx), %xmm2
 	movaps	21(%rcx), %xmm3
@@ -1685,11 +1605,9 @@ L(Shl11LoopStart):
 	jmp	L(Shl11LoopStart)
 
 L(Shl11LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$5, %xmm6
+	mov	-3(%rcx), %r9
 	mov	$5, %rsi
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -3(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1708,7 +1626,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -1716,7 +1633,7 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit12Case2OrCase3)
@@ -1724,10 +1641,9 @@ L(Shl12Start):
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1742,7 +1658,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -1750,7 +1665,6 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit12Case2OrCase3)
@@ -1758,8 +1672,7 @@ L(Shl12Start):
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1774,6 +1687,8 @@ L(Shl12Start):
 # endif
 	movaps	-12(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl12LoopStart):
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
@@ -1807,11 +1722,9 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1830,7 +1743,6 @@ L(Shl13Start):
 	jnz	L(Shl13LoopExit)
 
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 
@@ -1838,7 +1750,7 @@ L(Shl13Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit13Case2OrCase3)
@@ -1846,10 +1758,9 @@ L(Shl13Start):
 	test	%rax, %rax
 	jnz	L(Shl13LoopExit)
 
-	palignr	$13, %xmm1, %xmm2
+	palignr	$13, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1864,7 +1775,6 @@ L(Shl13Start):
 	jnz	L(Shl13LoopExit)
 
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 
@@ -1872,7 +1782,6 @@ L(Shl13Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit13Case2OrCase3)
@@ -1880,8 +1789,7 @@ L(Shl13Start):
 	test	%rax, %rax
 	jnz	L(Shl13LoopExit)
 
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$13, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	19(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -1896,6 +1804,8 @@ L(Shl13Start):
 # endif
 	movaps	-13(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl13LoopStart):
 	movaps	3(%rcx), %xmm2
 	movaps	19(%rcx), %xmm3
@@ -1929,11 +1839,9 @@ L(Shl13LoopStart):
 	jmp	L(Shl13LoopStart)
 
 L(Shl13LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$3, %xmm6
+	mov	-1(%rcx), %r9d
 	mov	$3, %rsi
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -1(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -1952,7 +1860,6 @@ L(Shl14Start):
 	jnz	L(Shl14LoopExit)
 
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 
@@ -1960,7 +1867,7 @@ L(Shl14Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit14Case2OrCase3)
@@ -1968,10 +1875,9 @@ L(Shl14Start):
 	test	%rax, %rax
 	jnz	L(Shl14LoopExit)
 
-	palignr	$14, %xmm1, %xmm2
+	palignr	$14, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -1986,7 +1892,6 @@ L(Shl14Start):
 	jnz	L(Shl14LoopExit)
 
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 
@@ -1994,7 +1899,6 @@ L(Shl14Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit14Case2OrCase3)
@@ -2002,8 +1906,7 @@ L(Shl14Start):
 	test	%rax, %rax
 	jnz	L(Shl14LoopExit)
 
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$14, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	18(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -2018,6 +1921,8 @@ L(Shl14Start):
 # endif
 	movaps	-14(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl14LoopStart):
 	movaps	2(%rcx), %xmm2
 	movaps	18(%rcx), %xmm3
@@ -2051,11 +1956,9 @@ L(Shl14LoopStart):
 	jmp	L(Shl14LoopStart)
 
 L(Shl14LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$2, %xmm6
+	mov	-2(%rcx), %r9d
 	mov	$2, %rsi
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -2(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -2074,7 +1977,6 @@ L(Shl15Start):
 	jnz	L(Shl15LoopExit)
 
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 
@@ -2082,7 +1984,7 @@ L(Shl15Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit15Case2OrCase3)
@@ -2090,10 +1992,9 @@ L(Shl15Start):
 	test	%rax, %rax
 	jnz	L(Shl15LoopExit)
 
-	palignr	$15, %xmm1, %xmm2
+	palignr	$15, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqb	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -2108,7 +2009,6 @@ L(Shl15Start):
 	jnz	L(Shl15LoopExit)
 
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 
@@ -2116,7 +2016,6 @@ L(Shl15Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 # ifdef USE_AS_STRNCPY
 	sub	$16, %r8
 	jbe	L(StrncpyExit15Case2OrCase3)
@@ -2124,8 +2023,7 @@ L(Shl15Start):
 	test	%rax, %rax
 	jnz	L(Shl15LoopExit)
 
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$15, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	17(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -2140,6 +2038,8 @@ L(Shl15Start):
 # endif
 	movaps	-15(%rcx), %xmm1
 
+/* 64 bytes loop */
+	.p2align 4
 L(Shl15LoopStart):
 	movaps	1(%rcx), %xmm2
 	movaps	17(%rcx), %xmm3
@@ -2173,16 +2073,15 @@ L(Shl15LoopStart):
 	jmp	L(Shl15LoopStart)
 
 L(Shl15LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$1, %xmm6
+	mov	-3(%rcx), %r9d
 	mov	$1, %rsi
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -3(%rdx)
 # ifdef USE_AS_STRCAT
 	jmp	L(CopyFrom1To16Bytes)
 # endif
 
 # ifndef USE_AS_STRCAT
+
 	.p2align 4
 L(CopyFrom1To16Bytes):
 #  ifdef USE_AS_STRNCPY
@@ -2463,7 +2362,7 @@ L(Exit4):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#  endif
+#   endif
 #  endif
 	ret
 
@@ -2485,7 +2384,7 @@ L(Exit5):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2507,7 +2406,7 @@ L(Exit6):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2617,7 +2516,7 @@ L(Exit12):
 #   ifdef USE_AS_STPCPY
 	cmpb	$1, (%rax)
 	sbb	$-1, %rax
-#   endif
+#  endif
 #  endif
 	ret
 
@@ -2955,11 +2854,10 @@ L(StrncpyExit8Bytes):
 	ret
 
 #  endif
-
 # endif
 
 # ifdef USE_AS_STRNCPY
-
+	.p2align 4
 L(StrncpyLeaveCase2OrCase3):
 	test	%rax, %rax
 	jnz	L(Aligned64LeaveCase2)
@@ -3014,710 +2912,639 @@ L(Aligned64LeaveCase2):
 	lea	-16(%r8), %r8
 	jmp	L(CopyFrom1To16BytesCase2)
 /*--------------------------------------------------*/
+	.p2align 4
 L(StrncpyExit1Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$15, %xmm6
+	movdqu	-1(%rcx), %xmm0
+	movdqu	%xmm0, -1(%rdx)
 	mov	$15, %rsi
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit2Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$14, %xmm6
+	movdqu	-2(%rcx), %xmm0
+	movdqu	%xmm0, -2(%rdx)
 	mov	$14, %rsi
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit3Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$13, %xmm6
+	movdqu	-3(%rcx), %xmm0
+	movdqu	%xmm0, -3(%rdx)
 	mov	$13, %rsi
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit4Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm0
+	movdqu	%xmm0, -4(%rdx)
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit5Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$11, %xmm6
+	movdqu	-5(%rcx), %xmm0
+	movdqu	%xmm0, -5(%rdx)
 	mov	$11, %rsi
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit6Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$10, %xmm6
-	mov	$10, %rsi
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	(%rcx), %rsi
+	mov	6(%rcx), %r9d
+	mov	%r9d, 6(%rdx)
+	mov	%rsi, (%rdx)
 	test	%rax, %rax
+	mov	$10, %rsi
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit7Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$9, %xmm6
-	mov	$9, %rsi
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	(%rcx), %rsi
+	mov	5(%rcx), %r9d
+	mov	%r9d, 5(%rdx)
+	mov	%rsi, (%rdx)
 	test	%rax, %rax
+	mov	$9, %rsi
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit8Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit9Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$7, %xmm6
+	mov	-1(%rcx), %r9
 	mov	$7, %rsi
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -1(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit10Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$6, %xmm6
+	mov	-2(%rcx), %r9
 	mov	$6, %rsi
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -2(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit11Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$5, %xmm6
+	mov	-3(%rcx), %r9
 	mov	$5, %rsi
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, -3(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit12Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit13Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$3, %xmm6
+	mov	-1(%rcx), %r9d
 	mov	$3, %rsi
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -1(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit14Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$2, %xmm6
+	mov	-2(%rcx), %r9d
 	mov	$2, %rsi
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -2(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyExit15Case2OrCase3):
-	movaps	(%rdx), %xmm6
-	psrldq	$1, %xmm6
+	mov	-3(%rcx), %r9d
 	mov	$1, %rsi
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, -3(%rdx)
 	test	%rax, %rax
 	jnz	L(CopyFrom1To16BytesCase2)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave1):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit1)
 	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	31(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	palignr	$1, %xmm1, %xmm2
+	palignr	$1, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	31+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit1)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit1):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$15, %xmm6
-	palignr	$1, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	15(%rsi), %rsi
+	lea	15(%rdx, %rsi), %rdx
+	lea	15(%rcx, %rsi), %rcx
+	mov	-15(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -15(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave2):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit2)
 	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	30(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	palignr	$2, %xmm1, %xmm2
+	palignr	$2, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	30+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit2)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit2):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$14, %xmm6
-	palignr	$2, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	14(%rsi), %rsi
+	lea	14(%rdx, %rsi), %rdx
+	lea	14(%rcx, %rsi), %rcx
+	mov	-14(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -14(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave3):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit3)
 	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	29(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	palignr	$3, %xmm1, %xmm2
+	palignr	$3, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	29+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit3)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit3):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$13, %xmm6
-	palignr	$3, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	13(%rsi), %rsi
+	lea	13(%rdx, %rsi), %rdx
+	lea	13(%rcx, %rsi), %rcx
+	mov	-13(%rcx), %rsi
+	mov	-8(%rcx), %rax
+	mov	%rsi, -13(%rdx)
+	mov	%rax, -8(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave4):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit4)
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	28+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit4)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit4):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$12, %xmm6
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	12(%rsi), %rsi
+	lea	12(%rdx, %rsi), %rdx
+	lea	12(%rcx, %rsi), %rcx
+	mov	-12(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -12(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave5):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit5)
 	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	27(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	palignr	$5, %xmm1, %xmm2
+	palignr	$5, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	27+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit5)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit5):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$11, %xmm6
-	palignr	$5, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	11(%rsi), %rsi
+	lea	11(%rdx, %rsi), %rdx
+	lea	11(%rcx, %rsi), %rcx
+	mov	-11(%rcx), %rsi
+	mov	-4(%rcx), %eax
+	mov	%rsi, -11(%rdx)
+	mov	%eax, -4(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave6):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit6)
 	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	26(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	palignr	$6, %xmm1, %xmm2
+	palignr	$6, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	26+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit6)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit6):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$10, %xmm6
-	palignr	$6, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	10(%rsi), %rsi
+	lea	10(%rdx, %rsi), %rdx
+	lea	10(%rcx, %rsi), %rcx
+	mov	-10(%rcx), %rsi
+	movw	-2(%rcx), %ax
+	mov	%rsi, -10(%rdx)
+	movw	%ax, -2(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave7):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit7)
 	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	25(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	palignr	$7, %xmm1, %xmm2
+	palignr	$7, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	25+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit7)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit7):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$9, %xmm6
-	palignr	$7, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	9(%rsi), %rsi
+	lea	9(%rdx, %rsi), %rdx
+	lea	9(%rcx, %rsi), %rcx
+	mov	-9(%rcx), %rsi
+	movb	-1(%rcx), %ah
+	mov	%rsi, -9(%rdx)
+	movb	%ah, -1(%rdx)
+	xor	%rsi, %rsi
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave8):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit8)
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	24+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit8)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit8):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$8, %xmm6
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	8(%rsi), %rsi
+	lea	8(%rdx, %rsi), %rdx
+	lea	8(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave9):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit9)
 	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	23(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	palignr	$9, %xmm1, %xmm2
+	palignr	$9, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	23+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit9)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit9):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$7, %xmm6
-	palignr	$9, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	7(%rsi), %rsi
+	lea	7(%rdx, %rsi), %rdx
+	lea	7(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave10):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit10)
 	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	22(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	palignr	$10, %xmm1, %xmm2
+	palignr	$10, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	22+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit10)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit10):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$6, %xmm6
-	palignr	$10, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	6(%rsi), %rsi
+	lea	6(%rdx, %rsi), %rdx
+	lea	6(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave11):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit11)
 	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	21(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	palignr	$11, %xmm1, %xmm2
+	palignr	$11, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	21+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit11)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit11):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$5, %xmm6
-	palignr	$11, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	5(%rsi), %rsi
+	lea	5(%rdx, %rsi), %rdx
+	lea	5(%rcx, %rsi), %rcx
+	mov	-8(%rcx), %rax
+	xor	%rsi, %rsi
+	mov	%rax, -8(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave12):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit12)
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	20+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit12)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit12):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$4, %xmm6
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	4(%rsi), %rsi
+	lea	4(%rdx, %rsi), %rdx
+	lea	4(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax
+	xor	%rsi, %rsi
+	mov	%eax, -4(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave13):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit13)
 	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	19(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	palignr	$13, %xmm1, %xmm2
+	palignr	$13, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	19+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit13)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit13):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$3, %xmm6
-	palignr	$13, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	3(%rsi), %rsi
+	lea	3(%rdx, %rsi), %rdx
+	lea	3(%rcx, %rsi), %rcx
+	mov	-4(%rcx), %eax
+	xor	%rsi, %rsi
+	mov	%eax, -4(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave14):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit14)
 	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	18(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	palignr	$14, %xmm1, %xmm2
+	palignr	$14, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	18+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit14)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit14):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$2, %xmm6
-	palignr	$14, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	2(%rsi), %rsi
+	lea	2(%rdx, %rsi), %rdx
+	lea	2(%rcx, %rsi), %rcx
+	movw	-2(%rcx), %ax
+	xor	%rsi, %rsi
+	movw	%ax, -2(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
 
+	.p2align 4
 L(StrncpyLeave15):
 	movaps	%xmm2, %xmm3
 	add	$48, %r8
 	jle	L(StrncpyExit15)
 	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	17(%rcx), %xmm2
 	lea	16(%rsi), %rsi
-	movaps	%xmm2, %xmm3
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	palignr	$15, %xmm1, %xmm2
+	palignr	$15, %xmm3, %xmm2
 	movaps	%xmm2, 16(%rdx)
-	movaps	17+16(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	movaps	%xmm2, %xmm1
 	movaps	%xmm4, 32(%rdx)
 	lea	16(%rsi), %rsi
 	sub	$16, %r8
 	jbe	L(StrncpyExit15)
-	movaps	%xmm7, %xmm1
 	movaps	%xmm5, 48(%rdx)
 	lea	16(%rsi), %rsi
 	lea	-16(%r8), %r8
 
 L(StrncpyExit15):
-	movaps	(%rdx, %rsi), %xmm6
-	psrldq	$1, %xmm6
-	palignr	$15, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx, %rsi)
-	lea	1(%rsi), %rsi
+	lea	1(%rdx, %rsi), %rdx
+	lea	1(%rcx, %rsi), %rcx
+	movb	-1(%rcx), %ah
+	xor	%rsi, %rsi
+	movb	%ah, -1(%rdx)
 	jmp	L(CopyFrom1To16BytesCase3)
+
 # endif
 # ifndef USE_AS_STRCAT
 END (STRCPY)
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 4e292f3..477b2cb 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -21,8 +21,9 @@
 #ifndef NOT_IN_libc
 # include <sysdep.h>
 
-.text
+	.section .text.ssse3,"ax",@progbits
 ENTRY (__wcscpy_ssse3)
+
 	mov	%rsi, %rcx
 	mov	%rdi, %rdx
 
@@ -136,6 +137,7 @@ L(Align16Both):
 
 	mov	$-0x40, %rsi
 
+	.p2align 4
 L(Aligned64Loop):
 	movaps	(%rcx), %xmm2
 	movaps	%xmm2, %xmm4
@@ -205,7 +207,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -213,15 +214,14 @@ L(Shl4Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -233,7 +233,6 @@ L(Shl4Start):
 	jnz	L(Shl4LoopExit)
 
 	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	28(%rcx), %xmm2
 
@@ -245,8 +244,7 @@ L(Shl4Start):
 	test	%rax, %rax
 	jnz	L(Shl4LoopExit)
 
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$4, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	28(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -259,6 +257,7 @@ L(Shl4Start):
 
 	movaps	-4(%rcx), %xmm1
 
+	.p2align 4
 L(Shl4LoopStart):
 	movaps	12(%rcx), %xmm2
 	movaps	28(%rcx), %xmm3
@@ -289,11 +288,9 @@ L(Shl4LoopStart):
 	jmp	L(Shl4LoopStart)
 
 L(Shl4LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$12, %xmm6
+	movdqu	-4(%rcx), %xmm1
 	mov	$12, %rsi
-	palignr	$4, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	movdqu	%xmm1, -4(%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -309,7 +306,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -317,15 +313,14 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -337,7 +332,6 @@ L(Shl8Start):
 	jnz	L(Shl8LoopExit)
 
 	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	24(%rcx), %xmm2
 
@@ -345,13 +339,11 @@ L(Shl8Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl8LoopExit)
 
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$8, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	24(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -364,6 +356,7 @@ L(Shl8Start):
 
 	movaps	-8(%rcx), %xmm1
 
+	.p2align 4
 L(Shl8LoopStart):
 	movaps	8(%rcx), %xmm2
 	movaps	24(%rcx), %xmm3
@@ -394,11 +387,9 @@ L(Shl8LoopStart):
 	jmp	L(Shl8LoopStart)
 
 L(Shl8LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$8, %xmm6
+	mov	(%rcx), %r9
 	mov	$8, %rsi
-	palignr	$8, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9, (%rdx)
 	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
@@ -414,7 +405,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -422,15 +412,14 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
+	movaps	%xmm2, %xmm1
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
-	movaps	%xmm3, %xmm1
 
 	pcmpeqd	%xmm2, %xmm0
 	lea	16(%rdx), %rdx
@@ -442,7 +431,6 @@ L(Shl12Start):
 	jnz	L(Shl12LoopExit)
 
 	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
 	movaps	%xmm2, (%rdx)
 	movaps	20(%rcx), %xmm2
 
@@ -450,13 +438,11 @@ L(Shl12Start):
 	lea	16(%rdx), %rdx
 	pmovmskb %xmm0, %rax
 	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
 
 	test	%rax, %rax
 	jnz	L(Shl12LoopExit)
 
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm3, %xmm1
+	palignr	$12, %xmm3, %xmm2
 	movaps	%xmm2, (%rdx)
 	lea	20(%rcx), %rcx
 	lea	16(%rdx), %rdx
@@ -469,6 +455,7 @@ L(Shl12Start):
 
 	movaps	-12(%rcx), %xmm1
 
+	.p2align 4
 L(Shl12LoopStart):
 	movaps	4(%rcx), %xmm2
 	movaps	20(%rcx), %xmm3
@@ -498,11 +485,10 @@ L(Shl12LoopStart):
 	jmp	L(Shl12LoopStart)
 
 L(Shl12LoopExit):
-	movaps	(%rdx), %xmm6
-	psrldq	$4, %xmm6
+	mov	(%rcx), %r9d
 	mov	$4, %rsi
-	palignr	$12, %xmm1, %xmm6
-	movaps	%xmm6, (%rdx)
+	mov	%r9d, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
 
 	.p2align 4
 L(CopyFrom1To16Bytes):
@@ -556,8 +542,10 @@ L(Exit12):
 
 	.p2align 4
 L(Exit16):
-	movdqu	(%rcx), %xmm0
-	movdqu	%xmm0, (%rdx)
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
 	mov	%rdi, %rax
 	ret
 

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                               |    8 +
 sysdeps/x86_64/multiarch/strcpy-ssse3.S |  767 ++++++++++++-------------------
 sysdeps/x86_64/multiarch/wcscpy-ssse3.S |   64 +--
 3 files changed, 331 insertions(+), 508 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]