This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] PPC64 strchr and strlen optimizations


Completed the 64-bit optimization of strchr.S and strlen.S for PPC64.

2003-04-04  Steven Munroe  <sjmunroe at us dot ibm dot com>
        * sysdeps/powerpc/powerpc64/strchr.S: 64-bit optimizations.
        * sysdeps/powerpc/powerpc64/strlen.S: 64-bit optimizations.
-- 
Steven Munroe
sjmunroe at us dot ibm dot com
Linux on PowerPC-64 Development
GLIBC for PowerPC-64 Development
diff -urN libc23-cvstip-20030402/sysdeps/powerpc/powerpc64/strchr.S libc23/sysdeps/powerpc/powerpc64/strchr.S
--- libc23-cvstip-20030402/sysdeps/powerpc/powerpc64/strchr.S	2002-09-17 18:50:02.000000000 -0500
+++ libc23/sysdeps/powerpc/powerpc64/strchr.S	2003-04-04 09:55:59.000000000 -0600
@@ -1,5 +1,5 @@
 /* Optimized strchr implementation for PowerPC64.
-   Copyright (C) 1997, 1999, 2000, 2002 Free Software Foundation, Inc.
+   Copyright (C) 1997, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -29,6 +29,11 @@
 
 #define rTMP1	r0
 #define rRTN	r3	/* outgoing result */
+/* Note:  The Bounded pointer support in this code is broken.  This code
+   was inherited from PPC32 and that support was never completed.  
+   Currently PPC gcc does not support -fbounds-check or -fbounded-pointers.
+   These artifacts are left in the code as a reminder in case we need
+   bounded pointer support in the future.  */
 #if __BOUNDED_POINTERS__
 # define rSTR	r4
 # define rCHR	r5	/* byte we're looking for, spread over the whole word */
@@ -39,8 +44,8 @@
 # define rWORD	r5	/* the current word */
 #endif
 #define rCLZB	rCHR	/* leading zero byte count */
-#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
-#define r7F7F	r7	/* constant 0x7f7f7f7f */
+#define rFEFE	r6	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
+#define r7F7F	r7	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rTMP2	r9
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
@@ -49,18 +54,23 @@
 	CHECK_BOUNDS_LOW (rSTR, rTMP1, rTMP2)
 	STORE_RETURN_BOUNDS (rTMP1, rTMP2)
 
+	dcbt	0,rRTN
 	rlwimi	rCHR, rCHR, 8, 16, 23
 	li	rMASK, -1
 	rlwimi	rCHR, rCHR, 16, 0, 15
-	rlwinm	rIGN, rRTN, 3, 27, 28
+	rlwinm	rIGN, rRTN, 3, 26, 28
+	insrdi	rCHR, rCHR, 32, 0
 	lis	rFEFE, -0x101
 	lis	r7F7F, 0x7f7f
-	clrrdi	rSTR, rRTN, 2
+	clrrdi	rSTR, rRTN, 3
 	addi	rFEFE, rFEFE, -0x101
 	addi	r7F7F, r7F7F, 0x7f7f
+	sldi	rTMP1, rFEFE, 32
+	insrdi	r7F7F, r7F7F, 32, 0
+	add	rFEFE, rFEFE, rTMP1
 /* Test the first (partial?) word.  */
-	lwz	rWORD, 0(rSTR)
-	srw	rMASK, rMASK, rIGN
+	ld	rWORD, 0(rSTR)
+	srd	rMASK, rMASK, rIGN
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
@@ -71,7 +81,7 @@
 
 /* The loop.  */
 
-L(loop):lwzu rWORD, 4(rSTR)
+L(loop):ldu rWORD, 8(rSTR)
 	and.	rTMP1, rTMP1, rTMP2
 /* Test for 0.	*/
 	add	rTMP1, rFEFE, rWORD
@@ -104,12 +114,12 @@
 	add	rTMP1, rTMP1, r7F7F
 	nor	rWORD, rMASK, rFEFE
 	nor	rTMP2, rIGN, rTMP1
-	cmplw	rWORD, rTMP2
+	cmpld	rWORD, rTMP2
 	bgtlr
-	cntlzw	rCLZB, rTMP2
-	srwi	rCLZB, rCLZB, 3
+	cntlzd	rCLZB, rTMP2
+	srdi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
-	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
+	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
 	STORE_RETURN_VALUE (rSTR)
 	blr
 
@@ -118,11 +128,11 @@
 	or	rIGN, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
 	nor	rTMP2, rIGN, rTMP1
-	cntlzw	rCLZB, rTMP2
-	subi	rSTR, rSTR, 4
-	srwi	rCLZB, rCLZB, 3
+	cntlzd	rCLZB, rTMP2
+	subi	rSTR, rSTR, 8
+	srdi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
-	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, twlge)
+	CHECK_BOUNDS_HIGH_RTN (rSTR, rTMP2, tdlge)
 	STORE_RETURN_VALUE (rSTR)
 	blr
 END (BP_SYM (strchr))
diff -urN libc23-cvstip-20030402/sysdeps/powerpc/powerpc64/strlen.S libc23/sysdeps/powerpc/powerpc64/strlen.S
--- libc23-cvstip-20030402/sysdeps/powerpc/powerpc64/strlen.S	2002-09-17 18:50:02.000000000 -0500
+++ libc23/sysdeps/powerpc/powerpc64/strlen.S	2003-04-04 09:55:21.000000000 -0600
@@ -1,5 +1,5 @@
 /* Optimized strlen implementation for PowerPC64.
-   Copyright (C) 1997, 1999, 2000, 2002 Free Software Foundation, Inc.
+   Copyright (C) 1997, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -60,7 +60,12 @@
    2) How popular are bytes with the high bit set? If they are very rare,
    on some processors it might be useful to use the simpler expression
    ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one
-   ALU), but this fails when any character has its high bit set.  */
+   ALU), but this fails when any character has its high bit set.  
+   
+   Answer:
+   1) Added a Data Cache Block Touch early to prefetch the first 128 
+   byte cache line. Adding dcbt instructions to the loop would not be 
+   effective since most strings will be shorter than the cache line.  */
 
 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
    0 and 3 through 12 (so long as we don't call any procedures) without
@@ -80,63 +85,68 @@
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
 			   string to make it start at a word boundary */
-#define rFEFE	r6	/* constant 0xfefefeff (-0x01010101) */
-#define r7F7F	r7	/* constant 0x7f7f7f7f */
-#define rWORD1	r8	/* current string word */
-#define rWORD2	r9	/* next string word */
-#define rMASK	r9	/* mask for first string word */
+#define rFEFE	r6	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
+#define r7F7F	r7	/* constant 0x7f7f7f7f7f7f7f7f */
+#define rWORD1	r8	/* current string doubleword */
+#define rWORD2	r9	/* next string doubleword */
+#define rMASK	r9	/* mask for first string doubleword */
 #define rTMP2	r10
 #define rTMP3	r11
 #define rTMP4	r12
 
+/* Note:  The Bounded pointer support in this code is broken.  This code
+   was inherited from PPC32 and that support was never completed.  
+   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.
+   These artifacts are left in the code as a reminder in case we need
+   bounded pointer support in the future.  */
 	CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2)
 
-	clrrdi	rSTR, rRTN, 2
+	dcbt	0,rRTN
+	clrrdi	rSTR, rRTN, 3
 	lis	r7F7F, 0x7f7f
-	rlwinm	rPADN, rRTN, 3, 27, 28
-	lwz	rWORD1, 0(rSTR)
-	li	rMASK, -1
+	rlwinm	rPADN, rRTN, 3, 26, 28
+	ld	rWORD1, 0(rSTR)
 	addi	r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
-   We make an exception and use method (2) on the first two words, to reduce
-   overhead.  */
-	srw	rMASK, rMASK, rPADN
+	li	rMASK, -1
+	insrdi	r7F7F, r7F7F, 32, 0
+/* That's the setup done, now do the first pair of doublewords.
+   We make an exception and use method (2) on the first two doublewords, 
+   to reduce overhead.  */
+	srd	rMASK, rMASK, rPADN
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
+	lis	rFEFE, -0x101
 	add	rTMP1, rTMP1, r7F7F
+	addi	rFEFE, rFEFE, -0x101
 	nor	rTMP1, rTMP2, rTMP1
 	and.	rWORD1, rTMP1, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
-	lis	rFEFE, -0x101
-	addi	rFEFE, rFEFE, -0x101
-	clrldi	rFEFE,rFEFE,32 /* clear upper 32 */
+	sldi  rTMP1, rFEFE, 32
+	add  rFEFE, rFEFE, rTMP1
 /* Are we now aligned to a doubleword boundary?  */
-	bt	29, L(loop)
+	bt	28, L(loop)
 
-/* Handle second word of pair.  */
-	lwzu	rWORD1, 4(rSTR)
+/* Handle second doubleword of pair.  */
+	ldu	rWORD1, 8(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
 	nor.	rWORD1, rTMP2, rTMP1
-	clrldi.	rWORD1,rWORD1,32 /* clear upper 32 */
 	bne	L(done0)
 
 /* The loop.  */
 
 L(loop):
-	lwz	rWORD1, 4(rSTR)
-	lwzu	rWORD2, 8(rSTR)
+	ld	rWORD1, 8(rSTR)
+	ldu	rWORD2, 16(rSTR)
 	add	rTMP1, rFEFE, rWORD1
 	nor	rTMP2, r7F7F, rWORD1
 	and.	rTMP1, rTMP1, rTMP2
-	clrldi.	rTMP1,rTMP1,32 /* clear upper 32 */
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
 	and.	rTMP1, rTMP3, rTMP4
-	clrldi.	rTMP1,rTMP1,32 /* clear upper 32 */
 	beq	L(loop)
 
 	and	rTMP1, r7F7F, rWORD2
@@ -146,17 +156,17 @@
 
 L(done1):
 	and	rTMP1, r7F7F, rWORD1
-	subi	rSTR, rSTR, 4
+	subi	rSTR, rSTR, 8
 	add	rTMP1, rTMP1, r7F7F
 	andc	rWORD1, rTMP2, rTMP1
 
-/* When we get to here, rSTR points to the first word in the string that
+/* When we get to here, rSTR points to the first doubleword in the string that
    contains a zero byte, and the most significant set bit in rWORD1 is in that
    byte.  */
 L(done0):
-	cntlzw	rTMP3, rWORD1
+	cntlzd	rTMP3, rWORD1
 	subf	rTMP1, rRTN, rSTR
-	srwi	rTMP3, rTMP3, 3
+	srdi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	/* GKM FIXME: check high bound.  */
 	blr

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]