This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 7 Oct 2013 18:36:10 +0400
- Subject: Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone> <20130916123234 dot GA24928 at domone> <20131007130651 dot GD5515 at domone>
I don't see any performance results.
Did I miss it?
And we should check also for Haswell and Silvermont to make sure it is fine.
--
Liubov
Intel Corporation
On Mon, Oct 7, 2013 at 5:06 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> ping
> On Mon, Sep 16, 2013 at 02:32:34PM +0200, Ondřej Bílka wrote:
>> On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
>> > Hi,
>> > I tried to gather data also for strcasecmp/strncasecmp and I got
>> > that they are used rarely on my system.
>> >
>> Thanks to Andreas I have an implementation ready.
>>
>> It works by first finding different characters with strcmp code, then
>> checking if their cases differ. As it is likely that these characters
>> were different performance should be similar to strcmp one. I checked
>> this property on my computer with following code and number of case
>> comparisons needed is mostly 1 in my test:
>>
>> #include <stdio.h>
>> int strcasecmp(unsigned char *x,unsigned char *y)
>> {
>> int casecmp=0;
>> int i=0;
>> while(1) {
>> if (x[i]!=y[i])
>> if (tolower(x[i])==tolower(y[i]))
>> casecmp++;
>> else
>> {
>> fprintf(stderr,"dif chars %i tolower_needed %i\n", i, casecmp+1);
>> return tolower(x[i])-tolower(y[i]);
>> }
>> if (!x[i])
>> {
>> fprintf(stderr,"same chars %i tolower_needed %i \n",i, casecmp);
>> return 0;
>> }
>> i++;
>> }
>> return 0;
>> }
>>
>> A downside of this implementation is that checking aaaa vs AAAA will be
>> slower; as this looks like an unlikely case, we could make this tradeoff.
>>
>> I added it in generic way as I plan to add also ssse3 loop version which
>> will come in separate patch.
>>
>> * sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
>> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
>> Add strcasecmp_l-sse2-unaligned.
>> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
>> strcasecmp_sse2_unaligned.
>> * sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
>> * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
>> implementation.
>> * sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.
>>
>> ---
>> sysdeps/x86_64/locale-defines.sym | 1 +
>> sysdeps/x86_64/multiarch/Makefile | 1 +
>> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 +
>> .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 2 +
>> sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 117 +++++++++++++++++++++
>> sysdeps/x86_64/multiarch/strcmp.S | 9 +-
>> 6 files changed, 127 insertions(+), 5 deletions(-)
>> create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>>
>> diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
>> index aebff9a..804debb 100644
>> --- a/sysdeps/x86_64/locale-defines.sym
>> +++ b/sysdeps/x86_64/locale-defines.sym
>> @@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
>> LC_CTYPE
>> _NL_CTYPE_NONASCII_CASE
>> LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
>> +LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
>> SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
>> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> index 5ab950a..551923c 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>> + strcasecmp_l-sse2-unaligned \
>> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>> strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 1a65ac0..40f8895 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> __strcasecmp_avx)
>> IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
>> __strcasecmp_sse42)
>> + IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
>> + __strcasecmp_sse2_unaligned)
>> IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
>> __strcasecmp_ssse3)
>> IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
>> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>> new file mode 100644
>> index 0000000..62ce37e
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>> @@ -0,0 +1,2 @@
>> +#define AS_STRCASECMP
>> +#include "strcmp-sse2-unaligned.S"
>> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> index eed8432..c93d2f5 100644
>> --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> @@ -16,10 +16,33 @@
>> License along with the GNU C Library; if not, see
>> <http://www.gnu.org/licenses/>. */
>>
>> +#ifndef NOT_IN_libc
>> +
>> #include "sysdep.h"
>> #define ALIGN(x) .p2align x
>>
>> +#ifdef AS_STRCASECMP
>> +# include "locale-defines.h"
>> +
>> +#define __strcasecmp_sse2_unaligned strcasecmp_new
>> +
>> +# define __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
>> +ENTRY (__strcasecmp_sse2_unaligned)
>> + movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
>> + mov %fs:(%rax), %rdx
>> + // XXX 5 byte should be before the function
>> + /* 5-byte NOP. */
>> + .byte 0x0f,0x1f,0x44,0x00,0x00
>> +
>> +END (__strcasecmp_sse2_unaligned)
>> +
>> +#endif
>> +
>> ENTRY ( __strcmp_sse2_unaligned)
>> +
>> +#ifdef AS_STRCASECMP
>> + mov LOCALE_TOLOWER(%rdx), %r11
>> +#endif
>> movl %edi, %eax
>> xorl %edx, %edx
>> pxor %xmm7, %xmm7
>> @@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
>> pmovmskb %xmm0, %eax
>> testq %rax, %rax
>> je L(next_48_bytes)
>> +#ifndef AS_STRCASECMP
>> L(return):
>> bsfq %rax, %rdx
>> movzbl (%rdi, %rdx), %eax
>> movzbl (%rsi, %rdx), %edx
>> subl %edx, %eax
>> ret
>> +#else
>> + jmp L(caseloop1)
>> +#endif
>>
>> ALIGN (4)
>> L(next_48_bytes):
>> @@ -85,6 +112,76 @@ L(main_loop_header):
>> movq %rcx, %rsi
>> jmp L(loop_start)
>>
>> +#ifdef AS_STRCASECMP
>> +L(caseloop1):
>> + bsfq %rax, %rdx
>> + leaq -1(%rax), %rcx
>> + andq %rax, %rcx
>> + movzbl (%rdi, %rdx), %eax
>> + movzbl (%rsi, %rdx), %edx
>> + movl (%r11, %rax, 4), %eax
>> + movl (%r11, %rdx, 4), %edx
>> + testl %eax, %eax
>> + je L(zero1)
>> + cmpl %edx, %eax
>> + je L(casecnt1)
>> +L(zero1):
>> + subl %edx, %eax
>> + ret
>> +L(casecnt1):
>> + testq %rcx, %rcx
>> + je L(next_48_bytes)
>> + movq %rcx, %rax
>> + jmp L(caseloop1)
>> +
>> +L(return):
>> +L(caseloop2):
>> + bsfq %rax, %rdx
>> + leaq -1(%rax), %rcx
>> + andq %rax, %rcx
>> + movzbl (%rdi, %rdx), %eax
>> + movzbl (%rsi, %rdx), %edx
>> + movl (%r11, %rax, 4), %eax
>> + movl (%r11, %rdx, 4), %edx
>> + testl %eax, %eax
>> + je L(zero2)
>> + cmpl %edx, %eax
>> + je L(casecnt2)
>> +L(zero2):
>> + subl %edx, %eax
>> + ret
>> +L(casecnt2):
>> + testq %rcx, %rcx
>> + je L(main_loop_header)
>> + movq %rcx, %rax
>> + jmp L(caseloop2)
>> +
>> +L(caseloop3):
>> + bsfq %rax, %rdx
>> + leaq -1(%rax), %r10
>> + andq %rax, %r10
>> + movzbl (%rdi, %rdx), %eax
>> + movzbl (%rsi, %rdx), %edx
>> + movl (%r11, %rax, 4), %eax
>> + movl (%r11, %rdx, 4), %edx
>> + testl %eax, %eax
>> + je L(zero3)
>> + cmpl %edx, %eax
>> + je L(casecnt3)
>> +L(zero3):
>> + subl %edx, %eax
>> + ret
>> +L(casecnt3):
>> + movq %rdi, %rax
>> + movq %rsi, %rdx
>> + testq %r10, %r10
>> + je L(back_to_loop)
>> + movq %r10, %rax
>> + jmp L(caseloop3)
>> +
>> +#endif
>> +
>> +
>> ALIGN (4)
>> L(loop):
>> addq $64, %rax
>> @@ -135,11 +232,18 @@ L(back_to_loop):
>> orq %rdi, %rcx
>> salq $48, %rsi
>> orq %rsi, %rcx
>> +#ifndef AS_STRCASECMP
>> bsfq %rcx, %rcx
>> movzbl (%rax, %rcx), %eax
>> movzbl (%rdx, %rcx), %edx
>> subl %edx, %eax
>> ret
>> +#else
>> + movq %rax, %rdi
>> + movq %rdx, %rsi
>> + movq %rcx, %rax
>> + jmp L(return)
>> +#endif
>>
>> ALIGN (4)
>> L(loop_cross_page):
>> @@ -185,11 +289,19 @@ L(loop_cross_page):
>> shrq %cl, %rdi
>> test %rdi, %rdi
>> je L(back_to_loop)
>> +#ifndef AS_STRCASECMP
>> bsfq %rdi, %rcx
>> movzbl (%rax, %rcx), %eax
>> movzbl (%rdx, %rcx), %edx
>> subl %edx, %eax
>> ret
>> +#else
>> + movq %rdi, %r10
>> + movq %rax, %rdi
>> + movq %rdx, %rsi
>> + movq %r10, %rax
>> + jmp L(caseloop3)
>> +#endif
>>
>> ALIGN (4)
>> L(cross_page_loop):
>> @@ -201,6 +313,10 @@ L(cross_page_loop):
>> L(cross_page):
>> movzbl (%rdi, %rdx), %eax
>> movzbl (%rsi, %rdx), %ecx
>> +#ifdef AS_STRCASECMP
>> + movl (%r11, %rax, 4), %eax
>> + movl (%r11, %rcx, 4), %ecx
>> +#endif
>> testb %al, %al
>> jne L(cross_page_loop)
>> xorl %eax, %eax
>> @@ -208,3 +324,4 @@ L(different):
>> subl %ecx, %eax
>> ret
>> END (__strcmp_sse2_unaligned)
>> +#endif
>> diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
>> index c5dcd1a..818aa31 100644
>> --- a/sysdeps/x86_64/multiarch/strcmp.S
>> +++ b/sysdeps/x86_64/multiarch/strcmp.S
>> @@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
>> jne 1f
>> call __init_cpu_features
>> 1:
>> + leaq __strcasecmp_sse2_unaligned(%rip), %rax
>> + testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
>> + jnz 3f
>> +
>> # ifdef HAVE_AVX_SUPPORT
>> leaq __strcasecmp_avx(%rip), %rax
>> testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>> jnz 3f
>> # endif
>> - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
>> - jnz 2f
>> - leaq __strcasecmp_sse42(%rip), %rax
>> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
>> - jnz 3f
>> 2: leaq __strcasecmp_ssse3(%rip), %rax
>> testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
>> jnz 3f
>> --
>> 1.8.3.2
>
> --
>
> Too much radiation coming from the soil.