This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp


I don't see any performance results.
Did I miss them?
We should also check on Haswell and Silvermont to make sure it is fine.

--
Liubov
Intel Corporation

On Mon, Oct 7, 2013 at 5:06 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> ping
> On Mon, Sep 16, 2013 at 02:32:34PM +0200, Ondřej Bílka wrote:
>> On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
>> > Hi,
>> > I also tried to gather data for strcasecmp/strncasecmp and found
>> > that they are rarely used on my system.
>> >
>> Thanks to Andreas I have an implementation ready.
>>
>> It works by first finding differing characters with the strcmp code, then
>> checking whether they differ only in case. As it is likely that these
>> characters really are different, performance should be similar to that
>> of strcmp. I checked this property on my computer with the following
>> code, and the number of case comparisons needed is mostly 1 in my test:
>>
>> #include <stdio.h>
>> int strcasecmp(unsigned char *x,unsigned char *y)
>> {
>>  int casecmp=0;
>>  int i=0;
>>  while(1) {
>>    if (x[i]!=y[i])
>>      if (tolower(x[i])==tolower(y[i]))
>>        casecmp++;
>>      else
>>        {
>>        fprintf(stderr,"dif chars %i tolower_needed %i\n", i, casecmp+1);
>>        return tolower(x[i])-tolower(y[i]);
>>        }
>>    if (!x[i])
>>      {
>>        fprintf(stderr,"same chars %i tolower_needed %i \n",i, casecmp);
>>        return 0;
>>      }
>>    i++;
>>  }
>>  return 0;
>> }
>>
>> The downside of this implementation is that checking aaaa vs AAAA will be
>> slower; as this looks like an unlikely case, we can make this tradeoff.
>>
>> I added it in a generic way, as I also plan to add an ssse3 loop version,
>> which will come in a separate patch.
>>
>>       * sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
>>       * sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
>>       Add strcasecmp_l-sse2-unaligned.
>>       * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
>>       strcasecmp_sse2_unaligned.
>>       * sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
>>       * sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
>>       implementation.
>>       * sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.
>>
>> ---
>>  sysdeps/x86_64/locale-defines.sym                  |   1 +
>>  sysdeps/x86_64/multiarch/Makefile                  |   1 +
>>  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |   2 +
>>  .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S |   2 +
>>  sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S   | 117 +++++++++++++++++++++
>>  sysdeps/x86_64/multiarch/strcmp.S                  |   9 +-
>>  6 files changed, 127 insertions(+), 5 deletions(-)
>>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>>
>> diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
>> index aebff9a..804debb 100644
>> --- a/sysdeps/x86_64/locale-defines.sym
>> +++ b/sysdeps/x86_64/locale-defines.sym
>> @@ -8,4 +8,5 @@ LOCALE_T___LOCALES            offsetof (struct __locale_struct, __locales)
>>  LC_CTYPE
>>  _NL_CTYPE_NONASCII_CASE
>>  LOCALE_DATA_VALUES           offsetof (struct __locale_data, values)
>> +LOCALE_TOLOWER                       offsetof (struct __locale_struct, __ctype_tolower)
>>  SIZEOF_VALUES                        sizeof (((struct __locale_data *) 0)->values[0])
>> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> index 5ab950a..551923c 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>>                  memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>>                  memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>>                  strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>> +                strcasecmp_l-sse2-unaligned \
>>                  strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>>                  strcpy-sse2-unaligned strncpy-sse2-unaligned \
>>                  stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 1a65ac0..40f8895 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>>                             __strcasecmp_avx)
>>             IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
>>                             __strcasecmp_sse42)
>> +           IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
>> +                           __strcasecmp_sse2_unaligned)
>>             IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
>>                             __strcasecmp_ssse3)
>>             IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
>> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>> new file mode 100644
>> index 0000000..62ce37e
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
>> @@ -0,0 +1,2 @@
>> +#define AS_STRCASECMP
>> +#include "strcmp-sse2-unaligned.S"
>> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> index eed8432..c93d2f5 100644
>> --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
>> @@ -16,10 +16,33 @@
>>     License along with the GNU C Library; if not, see
>>     <http://www.gnu.org/licenses/>.  */
>>
>> +#ifndef NOT_IN_libc
>> +
>>  #include "sysdep.h"
>>  #define ALIGN(x)     .p2align x
>>
>> +#ifdef AS_STRCASECMP
>> +# include "locale-defines.h"
>> +
>> +#define __strcasecmp_sse2_unaligned strcasecmp_new
>> +
>> +# define  __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
>> +ENTRY (__strcasecmp_sse2_unaligned)
>> +     movq    __libc_tsd_LOCALE@gottpoff(%rip), %rax
>> +     mov     %fs:(%rax), %rdx
>> +        // XXX 5 byte should be before the function
>> +        /* 5-byte NOP.  */
>> +        .byte   0x0f,0x1f,0x44,0x00,0x00
>> +
>> +END (__strcasecmp_sse2_unaligned)
>> +
>> +#endif
>> +
>>  ENTRY ( __strcmp_sse2_unaligned)
>> +
>> +#ifdef AS_STRCASECMP
>> +     mov     LOCALE_TOLOWER(%rdx), %r11
>> +#endif
>>       movl    %edi, %eax
>>       xorl    %edx, %edx
>>       pxor    %xmm7, %xmm7
>> @@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
>>       pmovmskb        %xmm0, %eax
>>       testq   %rax, %rax
>>       je      L(next_48_bytes)
>> +#ifndef AS_STRCASECMP
>>  L(return):
>>       bsfq    %rax, %rdx
>>       movzbl  (%rdi, %rdx), %eax
>>       movzbl  (%rsi, %rdx), %edx
>>       subl    %edx, %eax
>>       ret
>> +#else
>> +     jmp     L(caseloop1)
>> +#endif
>>
>>       ALIGN (4)
>>  L(next_48_bytes):
>> @@ -85,6 +112,76 @@ L(main_loop_header):
>>       movq    %rcx, %rsi
>>       jmp     L(loop_start)
>>
>> +#ifdef AS_STRCASECMP
>> +L(caseloop1):
>> +     bsfq    %rax, %rdx
>> +     leaq    -1(%rax), %rcx
>> +     andq    %rax, %rcx
>> +     movzbl  (%rdi, %rdx), %eax
>> +     movzbl  (%rsi, %rdx), %edx
>> +     movl    (%r11, %rax, 4), %eax
>> +     movl    (%r11, %rdx, 4), %edx
>> +     testl   %eax, %eax
>> +     je      L(zero1)
>> +     cmpl    %edx, %eax
>> +     je      L(casecnt1)
>> +L(zero1):
>> +     subl    %edx, %eax
>> +     ret
>> +L(casecnt1):
>> +     testq   %rcx, %rcx
>> +     je      L(next_48_bytes)
>> +     movq    %rcx, %rax
>> +     jmp     L(caseloop1)
>> +
>> +L(return):
>> +L(caseloop2):
>> +     bsfq    %rax, %rdx
>> +     leaq    -1(%rax), %rcx
>> +     andq    %rax, %rcx
>> +     movzbl  (%rdi, %rdx), %eax
>> +     movzbl  (%rsi, %rdx), %edx
>> +     movl    (%r11, %rax, 4), %eax
>> +     movl    (%r11, %rdx, 4), %edx
>> +     testl   %eax, %eax
>> +     je      L(zero2)
>> +     cmpl    %edx, %eax
>> +     je      L(casecnt2)
>> +L(zero2):
>> +     subl    %edx, %eax
>> +     ret
>> +L(casecnt2):
>> +     testq   %rcx, %rcx
>> +     je      L(main_loop_header)
>> +     movq    %rcx, %rax
>> +     jmp     L(caseloop2)
>> +
>> +L(caseloop3):
>> +     bsfq    %rax, %rdx
>> +     leaq    -1(%rax), %r10
>> +     andq    %rax, %r10
>> +     movzbl  (%rdi, %rdx), %eax
>> +     movzbl  (%rsi, %rdx), %edx
>> +     movl    (%r11, %rax, 4), %eax
>> +     movl    (%r11, %rdx, 4), %edx
>> +     testl   %eax, %eax
>> +     je      L(zero3)
>> +     cmpl    %edx, %eax
>> +     je      L(casecnt3)
>> +L(zero3):
>> +     subl    %edx, %eax
>> +     ret
>> +L(casecnt3):
>> +     movq    %rdi, %rax
>> +     movq    %rsi, %rdx
>> +     testq   %r10, %r10
>> +     je      L(back_to_loop)
>> +     movq    %r10, %rax
>> +     jmp     L(caseloop3)
>> +
>> +#endif
>> +
>> +
>>       ALIGN   (4)
>>  L(loop):
>>       addq    $64, %rax
>> @@ -135,11 +232,18 @@ L(back_to_loop):
>>       orq     %rdi, %rcx
>>       salq    $48, %rsi
>>       orq     %rsi, %rcx
>> +#ifndef AS_STRCASECMP
>>       bsfq    %rcx, %rcx
>>       movzbl  (%rax, %rcx), %eax
>>       movzbl  (%rdx, %rcx), %edx
>>       subl    %edx, %eax
>>       ret
>> +#else
>> +     movq    %rax, %rdi
>> +     movq    %rdx, %rsi
>> +     movq    %rcx, %rax
>> +     jmp     L(return)
>> +#endif
>>
>>       ALIGN (4)
>>  L(loop_cross_page):
>> @@ -185,11 +289,19 @@ L(loop_cross_page):
>>       shrq    %cl, %rdi
>>       test    %rdi, %rdi
>>       je      L(back_to_loop)
>> +#ifndef AS_STRCASECMP
>>       bsfq    %rdi, %rcx
>>       movzbl  (%rax, %rcx), %eax
>>       movzbl  (%rdx, %rcx), %edx
>>       subl    %edx, %eax
>>       ret
>> +#else
>> +     movq    %rdi, %r10
>> +     movq    %rax, %rdi
>> +     movq    %rdx, %rsi
>> +     movq    %r10, %rax
>> +     jmp     L(caseloop3)
>> +#endif
>>
>>       ALIGN (4)
>>  L(cross_page_loop):
>> @@ -201,6 +313,10 @@ L(cross_page_loop):
>>  L(cross_page):
>>       movzbl  (%rdi, %rdx), %eax
>>       movzbl  (%rsi, %rdx), %ecx
>> +#ifdef AS_STRCASECMP
>> +     movl    (%r11, %rax, 4), %eax
>> +     movl    (%r11, %rcx, 4), %ecx
>> +#endif
>>       testb   %al, %al
>>       jne     L(cross_page_loop)
>>       xorl    %eax, %eax
>> @@ -208,3 +324,4 @@ L(different):
>>       subl    %ecx, %eax
>>       ret
>>  END (__strcmp_sse2_unaligned)
>> +#endif
>> diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
>> index c5dcd1a..818aa31 100644
>> --- a/sysdeps/x86_64/multiarch/strcmp.S
>> +++ b/sysdeps/x86_64/multiarch/strcmp.S
>> @@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
>>       jne     1f
>>       call    __init_cpu_features
>>  1:
>> +     leaq    __strcasecmp_sse2_unaligned(%rip), %rax
>> +     testl   $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
>> +     jnz     3f
>> +
>>  #  ifdef HAVE_AVX_SUPPORT
>>       leaq    __strcasecmp_avx(%rip), %rax
>>       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>>       jnz     3f
>>  #  endif
>> -     testl   $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
>> -     jnz     2f
>> -     leaq    __strcasecmp_sse42(%rip), %rax
>> -     testl   $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
>> -     jnz     3f
>>  2:   leaq    __strcasecmp_ssse3(%rip), %rax
>>       testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
>>       jnz     3f
>> --
>> 1.8.3.2
>
> --
>
> Too much radiation coming from the soil.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]