This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] PowerPC: stpcpy optimization for PPC64/POWER7


Hi all,

Following Alan Modra's suggestion, this is a stpcpy optimization patch for PPC64.
This patch optimizes the default PPC64 implementation by adding doubleword
loads/stores, increasing aligned throughput for large sizes.

For the POWER7 version it also removes unneeded branch prediction hints and uses
cmpb instructions instead of the bitwise operations to find the string's end. This
saves some cycles for both aligned and unaligned cases.

Tested on PPC64 POWER4/POWER7. I'm attaching the benchtests output for each
case (default master, default optimized, POWER7 master, and POWER7 optimized).

---

2013-09-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>

        * sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Add doubleword read
	and write to provide a boost for large inputs.
	* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file.

--

diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
new file mode 100644
index 0000000..3b44df2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
@@ -0,0 +1,156 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* End-of-string detection uses cmpb against a zero register, not the bitwise test described in strlen.S.  */
+
+/* char * [r3] stpcpy (char *dest [r3], const char *src [r4])  */
+
+	.machine  power7
+EALIGN (__stpcpy, 4, 0)
+	CALL_MCOUNT 2
+
+#define rTMP	r0	/* scratch: alignment bits, cmpb result, extracted byte */
+#define rRTN	r3	/* pointer to previous word/doubleword in dest */
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
+#define rMASK	r5	/* zero; cmpb operand used to detect null bytes */
+#define rWORD	r6	/* current word from src */
+#define rALT	r7	/* alternate word from src */
+
+	or	rTMP, rSRC, rRTN	/* Merge the low bits of both pointers.  */
+	clrldi.	rTMP, rTMP, 61		/* Low 3 bits: nonzero if either is not 8-byte aligned.  */
+	bne	L(check_word_alignment)
+
+/* For doubleword aligned memory, operate using doubleword load and stores.  */
+	li	rMASK, 0
+	addi	rRTN, rRTN, -8		/* Bias dest: the update-form stores pre-increment by 8.  */
+	ld	rWORD, 0(rSRC)
+	b	L(g2)
+
+	.align 4
+L(g0):	ldu	rALT, 8(rSRC)
+	stdu	rWORD, 8(rRTN)		/* rWORD was already checked: it holds no null.  */
+	cmpb	rTMP, rALT, rMASK	/* rTMP gets 0xff in each byte where rALT is zero.  */
+	cmpdi	rTMP, 0
+	bne	L(g1)
+	ldu	rWORD, 8(rSRC)
+	stdu	rALT, 8(rRTN)
+L(g2):	cmpb	rTMP, rWORD, rMASK
+	cmpdi	rTMP, 0		/* If rTMP is 0, no nulls have been found.  */
+	beq	L(g0)
+
+	mr	rALT, rWORD		/* L(g1) expects the final doubleword in rALT.  */
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g1):
+	extrdi.	rTMP, rALT, 8, 0	/* Most-significant byte first; CR0 eq if it is null.  */
+	stbu	rTMP, 8(rRTN)		/* rRTN is still one doubleword back: advance by 8.  */
+	beqlr				/* Null stored: rRTN (r3) points at it.  */
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	stbu	rALT, 1(rRTN)		/* Only the low byte is left, so it must be the null.  */
+	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62		/* Low 2 bits: nonzero if either is not 4-byte aligned.  */
+	bne     L(unaligned)
+
+/* For word aligned memory, operate using word load and
+   stores.  */
+	li	rMASK, 0
+	addi	rRTN, rRTN, -4		/* Bias dest: the update-form stores pre-increment by 4.  */
+	lwz	rWORD, 0(rSRC)
+	b	L(g5)
+
+	.align	4
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)		/* rWORD was already checked: it holds no null.  */
+	cmpb	rTMP, rALT, rMASK
+	cmpwi	rTMP, 0			/* Word compare: lwzu zero-extends, so the upper cmpb bytes always match.  */
+	bne	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	cmpb	rTMP, rWORD, rMASK
+	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
+	beq	L(g3)
+
+        mr      rALT, rWORD		/* L(g4) expects the final word in rALT.  */
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+	rlwinm. rTMP, rALT, 8, 24, 31	/* Most-significant byte of the word first.  */
+        stbu    rTMP, 4(rRTN)		/* rRTN is still one word back: advance by 4.  */
+        beqlr
+        rlwinm. rTMP, rALT, 16, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr
+        rlwinm. rTMP, rALT, 24, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr
+        stbu    rALT, 1(rRTN)		/* Only the low byte is left, so it must be the null.  */
+        blr
+
+/* Oh well.  In this case, we just do a byte-by-byte copy.  */
+	.align	4
+L(unaligned):
+	lbz	rWORD, 0(rSRC)
+	addi	rRTN, rRTN, -1		/* Bias dest: stbu pre-increments by 1.  */
+	cmpdi	rWORD, 0
+	beq	L(u2)
+
+	.align 	5
+L(u0):	lbzu	rALT, 1(rSRC)		/* Loop unrolled 4x, alternating rALT and rWORD.  */
+	stbu	rWORD, 1(rRTN)
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	beq	L(u2)
+	lbzu	rALT, 1(rSRC)
+	stbu	rWORD, 1(rRTN)
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	bne	L(u0)
+L(u2):	stbu	rWORD, 1(rRTN)		/* The null is in rWORD; rRTN ends up pointing at it.  */
+	blr
+L(u1):	stbu	rALT, 1(rRTN)		/* The null is in rALT; rRTN ends up pointing at it.  */
+	blr
+END (__stpcpy)
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
index 070cd46..d3db070 100644
--- a/sysdeps/powerpc/powerpc64/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/stpcpy.S
@@ -26,35 +26,40 @@ EALIGN (__stpcpy, 4, 0)
 	CALL_MCOUNT 2
 
 #define rTMP	r0
-#define rRTN	r3
-#define rDEST	r3		/* pointer to previous word in dest */
-#define rSRC	r4		/* pointer to previous word in src */
-#define rWORD	r6		/* current word from src */
-#define rFEFE	r7		/* 0xfefefeff */
-#define r7F7F	r8		/* 0x7f7f7f7f */
-#define rNEG	r9		/* ~(word in src | 0x7f7f7f7f) */
-#define rALT	r10		/* alternate word from src */
-
-	or	rTMP, rSRC, rDEST
-	clrldi.	rTMP, rTMP, 62
-	addi	rDEST, rDEST, -4
-	bne	L(unaligned)
+#define rRTN	r3	/* pointer to previous word/doubleword in dest */
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
+#define rWORD	r6	/* current word from src */
+#define rFEFE	r7	/* constant 0xfefefeff | 0xfefefefefefefeff */
+#define r7F7F	r8	/* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
+#define rNEG	r9	/* ~(word in s1 | r7F7F) */
+#define rALT	r10	/* alternate word from src */
+
+	or	rTMP, rSRC, rRTN
+	clrldi.	rTMP, rTMP, 61
+	bne	L(check_word_alignment)
+
+/* For doubleword aligned memory, operate using doubleword load and
+   stores.  */
+	addi	rRTN, rRTN, -8
 
 	lis	rFEFE, -0x101
 	lis	r7F7F, 0x7f7f
-	lwz	rWORD, 0(rSRC)
+	ld	rWORD, 0(rSRC)
 	addi	rFEFE, rFEFE, -0x101
 	addi	r7F7F, r7F7F, 0x7f7f
+	sldi	rTMP, rFEFE, 32
+	insrdi	r7F7F, r7F7F, 32, 0
+	add	rFEFE, rFEFE, rTMP
 	b	L(g2)
 
-L(g0):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rDEST)
+L(g0):	ldu	rALT, 8(rSRC)
+	stdu	rWORD, 8(rRTN)
 	add	rTMP, rFEFE, rALT
 	nor	rNEG, r7F7F, rALT
 	and.	rTMP, rTMP, rNEG
 	bne-	L(g1)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rDEST)
+	ldu	rWORD, 8(rSRC)
+	stdu	rALT, 8(rRTN)
 L(g2):	add	rTMP, rFEFE, rWORD
 	nor	rNEG, r7F7F, rWORD
 	and.	rTMP, rTMP, rNEG
@@ -62,39 +67,95 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 4(rDEST)
+L(g1):
+	extrdi.	rTMP, rALT, 8, 0
+	stbu	rTMP, 8(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rDEST)
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rDEST)
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	stbu	rALT, 1(rDEST)
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	stbu	rALT, 1(rRTN)
 	blr
 
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62
+	bne     L(unaligned)
+
+/* For word aligned memory, operate using word load and
+   stores.  */
+	addi	rRTN, rRTN, -4
+
+	lis	rFEFE, -0x101
+	lis	r7F7F, 0x7f7f
+	lwz	rWORD, 0(rSRC)
+	addi	rFEFE, rFEFE, -0x101
+	addi	r7F7F, r7F7F, 0x7f7f
+	b	L(g5)
+
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)
+	add	rTMP, rFEFE, rALT
+	nor	rNEG, r7F7F, rALT
+	and.	rTMP, rTMP, rNEG
+	bne-	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	add	rTMP, rFEFE, rWORD
+	nor	rNEG, r7F7F, rWORD
+	and.	rTMP, rTMP, rNEG
+	beq+	L(g3)
+
+        mr      rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+	rlwinm. rTMP, rALT, 8, 24, 31
+        stbu    rTMP, 4(rRTN)
+        beqlr-
+        rlwinm. rTMP, rALT, 16, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr-
+        rlwinm. rTMP, rALT, 24, 24, 31
+        stbu    rTMP, 1(rRTN)
+        beqlr-
+        stbu    rALT, 1(rRTN)
+        blr
+
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
 	nop
 L(unaligned):
 	lbz	rWORD, 0(rSRC)
-	addi	rDEST, rDEST, 3
+	addi	rRTN, rRTN, -1
 	cmpwi	rWORD, 0
 	beq-	L(u2)
 
 L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rDEST)
+	stbu	rWORD, 1(rRTN)
 	cmpwi	rALT, 0
 	beq-	L(u1)
 	nop		/* Let 601 load start of loop.  */
 	lbzu	rWORD, 1(rSRC)
-	stbu	rALT, 1(rDEST)
+	stbu	rALT, 1(rRTN)
 	cmpwi	rWORD, 0
 	bne+	L(u0)
-L(u2):	stbu	rWORD, 1(rDEST)
+L(u2):	stbu	rWORD, 1(rRTN)
 	blr
-L(u1):	stbu	rALT, 1(rDEST)
+L(u1):	stbu	rALT, 1(rRTN)
 	blr
 END (__stpcpy)

Attachment: bench-stpcpy-master-power4.out
Description: Text document

Attachment: bench-stpcpy-master-power7.out
Description: Text document

Attachment: bench-stpcpy-patch-power4.out
Description: Text document

Attachment: bench-stpcpy-patch-power7.out
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]