This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] PowerPC - Optimization for str[n]casecmp functions


On 10/29/2011 02:03 PM, Ulrich Drepper wrote:
> On Tue, Oct 25, 2011 at 15:04, Adhemerval Zanella
> <azanella@linux.vnet.ibm.com> wrote:
>> This patch provides throughput boost for the strcasecmp/strncasecmp
>> functions for POWER7 for both ppc32 (25%) and ppc64 (40%),
> If it does, then fix the compiler to automatically do this.  Add
> options for the ppc builds.  There i no reason whatsoever to add this
> code as it is identical to the generic code.
>
I have tried to make GCC emit good code for the function strcasecmp by adding an option 
to force loop unrolling (GCC does not unroll loops with branches, even with 
-funroll-all-loops) and although the resulting code was better I still could see some 
room for improvements.

For the strcasecmp algorithm GCC unrolls loops by adding arithmetic instructions to 
update the string descriptors, while a better version would be by using loads with 
offsets (lbz r1, offset(r2)) and just updating the loops at end. While this is the best
option for this algorithm it might not the best option in general.

So I am providing an assembly version that is slight better than the C coded one with
manually loop unrolling. For PPC32 the loop unrolling is straightforward: loads with
offsets and descriptor updating at then end. For PPC64 I added some pipelining 
optimizations: the loop is dislocated to 'cmpdi' instruction (to make avoid branch
after a branch test instruction) and the loop stop test is done with only one branch 
instruction.

In the other hand, for strncasecmp the size constraint gives the compiler a much
better hint on how to generate the best code. So the option '-funroll-loop' already
produces a good assembly version.

--

PowerPC - Optimization for str[n]casecmp functions

This patch provides throughput boost for the strcasecmp function (25% on ppc32
and 40% on ppc64) and strncasecmp (15% on both ppc32 and ppc64) for POWER7.
The optimization is done by manually (strcasecmp) or automatically (strncasecmp)
unrolling the test loop to avoid CPU stalls caused by a test followed by a load.

2011-11-22  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>

	* sysdeps/powerpc/Makefile: Added locale-defines.sym generation.
	* sysdeps/powerpc/locale-defines.sym: Locale definitions for strcasecmp
	optimized code.
	* sysdeps/powerpc/powerpc32/power7/Makefile: New file: added unroll-loop
	option for strncasecmp/strncasecmp_l compilation.
	* sysdeps/powerpc/powerpc32/power7/strcasecmp.S: New file: strcasecmp
	optimization for PPC32.
	* sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S: New file: strcasecmp_l
	optimization for PPC32.
	* sysdeps/powerpc/powerpc64/power7/Makefile: Added unroll-loop option for
	strncasecmp/strncasecmp_l compilation.
	* sysdeps/powerpc/powerpc64/power7/strcasecmp.S: New file: strcasecmp
	optimization for PPC64.
	* sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S: New file: strcasecmp_l
	optimization for PPC64.

diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile
index e43ca70..23a9a16 100644
--- a/sysdeps/powerpc/Makefile
+++ b/sysdeps/powerpc/Makefile
@@ -23,4 +23,6 @@ endif
 ifeq ($(subdir),csu)
 # get offset to rtld_global._dl_hwcap
 gen-as-const-headers += rtld-global-offsets.sym
+# get offset to __locale_struct.__ctype_tolower
+gen-as-const-headers += locale-defines.sym
 endif
diff --git a/sysdeps/powerpc/locale-defines.sym b/sysdeps/powerpc/locale-defines.sym
new file mode 100644
index 0000000..af64b92
--- /dev/null
+++ b/sysdeps/powerpc/locale-defines.sym
@@ -0,0 +1,5 @@
+#include <locale/localeinfo.h>
+
+--
+
+LOCALE_CTYPE_TOLOWER	offsetof (struct __locale_struct, __ctype_tolower)
diff --git a/sysdeps/powerpc/powerpc32/power7/Makefile b/sysdeps/powerpc/powerpc32/power7/Makefile
new file mode 100644
index 0000000..5e8f4a2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
new file mode 100644
index 0000000..5d84fce
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
@@ -0,0 +1,132 @@
+/* Optimized strcasecmp implementation for PowerPC32.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte readed from 1st string */
+#define rCHAR2	r7	/* Byte readed from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Byte tolower(rCHAR1) */
+#define rLWR2	r12	/* Byte tolower(rCHAR2) */
+#define rTMP	r0
+#define rGOT	r9	/* Address of the Global Offset Table */
+#define rLOC	r11	/* Default locale address */
+
+	cmpw    cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+	mflr	rTMP
+	bcl	20,31,.L1
+.L1:	mflr	rGOT
+	addis	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+	addi 	rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add 	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+	mtlr	rTMP
+# else
+	lis	rTMP,_GLOBAL_OFFSET_TABLE_@ha
+	la	rLOC,_GLOBAL_OFFSET_TABLE_@l(rTMP)
+	lwz	rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+	add	rLOC, rLOC, __libc_tsd_LOCALE@tls
+	lwz	rLOC, 0(rLOC)
+# endif /* SHARED */
+#else
+	mr	rLOC, rLOCARG
+#endif
+	mr	rSTR1, rRTN
+	lwz	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	li	rRTN, 0
+	beqlr	cr7
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+L(loop):
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpwi	cr7, rCHAR1, 0		/* *s1 == '\0' ? */
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	beqlr	cr7
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpwi	cr7, rCHAR1, 0
+	subf.	r3, rLWR2, rLWR1
+	bnelr
+	bne	cr7,L(loop)
+	blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000..c13c4eb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
index b0f4520..40aacfa 100644
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ b/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -3,3 +3,8 @@ ifeq ($(subdir),elf)
 # optimization may require a TOC reference before relocations are resolved.
 CFLAGS-rtld.c += -mno-vsx
 endif
+
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000..1477b2e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,125 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+	CALL_MCOUNT 2
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte readed from 1st string */
+#define rCHAR2	r7	/* Byte readed from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r9
+#define rLOC	r11	/* Default locale address */
+
+	cmpd	cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
+	ld	rLOC, 0(rLOC)
+#else
+	mr	rLOC, rLOCARG
+#endif
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	mr	rSTR1, rRTN
+	li	rRTN, 0
+	beqlr	cr7
+
+
+	/* Unrolling loop for POWER: loads are done with 'lbz' plus
+	offset and string descriptors are only updated in the end
+	of loop unrolling. */
+
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+L(loop):
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 != '\0') || (r == 1) */
+	beq	cr1, L(done)
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	cmpdi	rCHAR1, 0
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq     cr1,L(done)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	b	L(loop)
+L(done):
+	subf	r0, rLWR2, rLWR1
+	extsw	rRTN, r0
+	blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000..c13c4eb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
-- 
1.6.0.2



Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]