This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] PowerPC - Optimization for str[n]casecmp functions
- From: Adhemerval Zanella <azanella at linux dot vnet dot ibm dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Date: Tue, 22 Nov 2011 18:30:50 -0200
- Subject: Re: [PATCH] PowerPC - Optimization for str[n]casecmp functions
- References: <4EA70840.2030002@linux.vnet.ibm.com> <CAOPLpQeE+8b_KVbX366vnqEh7-O5ejYsSKRhs1YC6HBmF3B9YA@mail.gmail.com>
On 10/29/2011 02:03 PM, Ulrich Drepper wrote:
> On Tue, Oct 25, 2011 at 15:04, Adhemerval Zanella
> <azanella@linux.vnet.ibm.com> wrote:
>> This patch provides throughput boost for the strcasecmp/strncasecmp
>> functions for POWER7 for both ppc32 (25%) and ppc64 (40%),
> If it does, then fix the compiler to automatically do this. Add
> options for the ppc builds. There i no reason whatsoever to add this
> code as it is identical to the generic code.
>
I have tried to make GCC emit good code for the function strcasecmp by adding an option
to force loop unrolling (GCC does not unroll loops with branches, even with
-funroll-all-loops) and although the resulting code was better I still could see some
room for improvements.
For the strcasecmp algorithm GCC unrolls loops by adding arithmetic instructions to
update the string descriptors, while a better version would be by using loads with
offsets (lbz r1, offset(r2)) and just updating the loops at end. While this is the best
option for this algorithm it might not the best option in general.
So I am providing an assembly version that is slight better than the C coded one with
manually loop unrolling. For PPC32 the loop unrolling is straightforward: loads with
offsets and descriptor updating at then end. For PPC64 I added some pipelining
optimizations: the loop is dislocated to 'cmpdi' instruction (to make avoid branch
after a branch test instruction) and the loop stop test is done with only one branch
instruction.
In the other hand, for strncasecmp the size constraint gives the compiler a much
better hint on how to generate the best code. So the option '-funroll-loop' already
produces a good assembly version.
--
PowerPC - Optimization for str[n]casecmp functions
This patch provides throughput boost for the strcasecmp function (25% on ppc32
and 40% on ppc64) and strncasecmp (15% on both ppc32 and ppc64) for POWER7.
The optimization is done by manually (strcasecmp) or automatically (strncasecmp)
unrolling the test loop to avoid CPU stalls caused by a test followed by a load.
2011-11-22 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/Makefile: Added locale-defines.sym generation.
* sysdeps/powerpc/locale-defines.sym: Locale definitions for strcasecmp
optimized code.
* sysdeps/powerpc/powerpc32/power7/Makefile: New file: added unroll-loop
option for strncasecmp/strncasecmp_l compilation.
* sysdeps/powerpc/powerpc32/power7/strcasecmp.S: New file: strcasecmp
optimization for PPC32.
* sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S: New file: strcasecmp_l
optimization for PPC32.
* sysdeps/powerpc/powerpc64/power7/Makefile: Added unroll-loop option for
strncasecmp/strncasecmp_l compilation.
* sysdeps/powerpc/powerpc64/power7/strcasecmp.S: New file: strcasecmp
optimization for PPC64.
* sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S: New file: strcasecmp_l
optimization for PPC64.
diff --git a/sysdeps/powerpc/Makefile b/sysdeps/powerpc/Makefile
index e43ca70..23a9a16 100644
--- a/sysdeps/powerpc/Makefile
+++ b/sysdeps/powerpc/Makefile
@@ -23,4 +23,6 @@ endif
ifeq ($(subdir),csu)
# get offset to rtld_global._dl_hwcap
gen-as-const-headers += rtld-global-offsets.sym
+# get offset to __locale_struct.__ctype_tolower
+gen-as-const-headers += locale-defines.sym
endif
diff --git a/sysdeps/powerpc/locale-defines.sym b/sysdeps/powerpc/locale-defines.sym
new file mode 100644
index 0000000..af64b92
--- /dev/null
+++ b/sysdeps/powerpc/locale-defines.sym
@@ -0,0 +1,5 @@
+#include <locale/localeinfo.h>
+
+--
+
+LOCALE_CTYPE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
diff --git a/sysdeps/powerpc/powerpc32/power7/Makefile b/sysdeps/powerpc/powerpc32/power7/Makefile
new file mode 100644
index 0000000..5e8f4a2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
new file mode 100644
index 0000000..5d84fce
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp.S
@@ -0,0 +1,132 @@
+/* Optimized strcasecmp implementation for PowerPC32.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+ or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+ int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+ __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+
+#define rRTN r3 /* Return value */
+#define rSTR1 r5 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rLOCARG r5 /* 3rd argument: locale_t */
+#define rCHAR1 r6 /* Byte readed from 1st string */
+#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Byte tolower(rCHAR1) */
+#define rLWR2 r12 /* Byte tolower(rCHAR2) */
+#define rTMP r0
+#define rGOT r9 /* Address of the Global Offset Table */
+#define rLOC r11 /* Default locale address */
+
+ cmpw cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+# ifdef SHARED
+ mflr rTMP
+ bcl 20,31,.L1
+.L1: mflr rGOT
+ addis rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@ha
+ addi rGOT, rGOT, _GLOBAL_OFFSET_TABLE_-.L1@l
+ lwz rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+ add rLOC, rLOC, __libc_tsd_LOCALE@tls
+ lwz rLOC, 0(rLOC)
+ mtlr rTMP
+# else
+ lis rTMP,_GLOBAL_OFFSET_TABLE_@ha
+ la rLOC,_GLOBAL_OFFSET_TABLE_@l(rTMP)
+ lwz rLOC, __libc_tsd_LOCALE@got@tprel(rGOT)
+ add rLOC, rLOC, __libc_tsd_LOCALE@tls
+ lwz rLOC, 0(rLOC)
+# endif /* SHARED */
+#else
+ mr rLOC, rLOCARG
+#endif
+ mr rSTR1, rRTN
+ lwz rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ li rRTN, 0
+ beqlr cr7
+
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+
+L(loop):
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpwi cr7, rCHAR1, 0 /* *s1 == '\0' ? */
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ beqlr cr7
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpwi cr7, rCHAR1, 0
+ subf. r3, rLWR2, rLWR1
+ bnelr
+ bne cr7,L(loop)
+ blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
new file mode 100644
index 0000000..c13c4eb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/sysdeps/powerpc/powerpc64/power7/Makefile b/sysdeps/powerpc/powerpc64/power7/Makefile
index b0f4520..40aacfa 100644
--- a/sysdeps/powerpc/powerpc64/power7/Makefile
+++ b/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -3,3 +3,8 @@ ifeq ($(subdir),elf)
# optimization may require a TOC reference before relocations are resolved.
CFLAGS-rtld.c += -mno-vsx
endif
+
+ifeq ($(subdir),string)
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000..1477b2e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,125 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4] )
+
+ or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+ int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+ __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP strcasecmp
+#endif
+
+ENTRY (BP_SYM (__STRCMP))
+ CALL_MCOUNT 2
+
+#define rRTN r3 /* Return value */
+#define rSTR1 r5 /* 1st string */
+#define rSTR2 r4 /* 2nd string */
+#define rLOCARG r5 /* 3rd argument: locale_t */
+#define rCHAR1 r6 /* Byte readed from 1st string */
+#define rCHAR2 r7 /* Byte readed from 2nd string */
+#define rADDR1 r8 /* Address of tolower(rCHAR1) */
+#define rADDR2 r12 /* Address of tolower(rCHAR2) */
+#define rLWR1 r8 /* Word tolower(rCHAR1) */
+#define rLWR2 r12 /* Word tolower(rCHAR2) */
+#define rTMP r9
+#define rLOC r11 /* Default locale address */
+
+ cmpd cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+ ld rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+ add rLOC, rTMP, __libc_tsd_LOCALE@tls
+ ld rLOC, 0(rLOC)
+#else
+ mr rLOC, rLOCARG
+#endif
+ ld rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+ mr rSTR1, rRTN
+ li rRTN, 0
+ beqlr cr7
+
+
+ /* Unrolling loop for POWER: loads are done with 'lbz' plus
+ offset and string descriptors are only updated in the end
+ of loop unrolling. */
+
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+L(loop):
+ cmpdi rCHAR1, 0 /* *s1 == '\0' ? */
+ sldi rADDR1, rCHAR1, 2 /* Calculate address for tolower(*s1) */
+ sldi rADDR2, rCHAR2, 2 /* Calculate address for tolower(*s2) */
+ lwzx rLWR1, rLOC, rADDR1 /* Load tolower(*s1) */
+ lwzx rLWR2, rLOC, rADDR2 /* Load tolower(*s2) */
+ cmpw cr1, rLWR1, rLWR2 /* r = tolower(*s1) == tolower(*s2) ? */
+ crorc 4*cr1+eq,eq,4*cr1+eq /* (*s1 != '\0') || (r == 1) */
+ beq cr1, L(done)
+ lbz rCHAR1, 1(rSTR1)
+ lbz rCHAR2, 1(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 2(rSTR1)
+ lbz rCHAR2, 2(rSTR2)
+ cmpdi rCHAR1, 0
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1, L(done)
+ lbz rCHAR1, 3(rSTR1)
+ lbz rCHAR2, 3(rSTR2)
+ cmpdi rCHAR1, 0
+ /* Increment both string descriptors */
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+ sldi rADDR1, rCHAR1, 2
+ sldi rADDR2, rCHAR2, 2
+ lwzx rLWR1, rLOC, rADDR1
+ lwzx rLWR2, rLOC, rADDR2
+ cmpw cr1, rLWR1, rLWR2
+ crorc 4*cr1+eq,eq,4*cr1+eq
+ beq cr1,L(done)
+ lbz rCHAR1, 0(rSTR1) /* Load char from s1 */
+ lbz rCHAR2, 0(rSTR2) /* Load char from s2 */
+ b L(loop)
+L(done):
+ subf r0, rLWR2, rLWR1
+ extsw rRTN, r0
+ blr
+END (BP_SYM (__STRCMP))
+
+weak_alias (BP_SYM (__STRCMP), BP_SYM (STRCMP))
+libc_hidden_builtin_def (__STRCMP)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000..c13c4eb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
--
1.6.0.2