This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp


ping
On Mon, Sep 16, 2013 at 02:32:34PM +0200, Ondřej Bílka wrote:
> On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> > Hi,
> > I also tried to gather data for strcasecmp/strncasecmp and found
> > that they are rarely used on my system.
> > 
> Thanks to Andreas I have an implementation ready.
> 
> It works by first finding differing characters with the strcmp code, then
> checking whether they differ only in case. As it is likely that these
> characters really are different, performance should be similar to that of
> strcmp. I checked this property on my computer with the following code,
> and the number of case comparisons needed is mostly 1 in my test:
> 
> #include <stdio.h>
> int strcasecmp(unsigned char *x,unsigned char *y)
> {
>  int casecmp=0;
>  int i=0;
>  while(1) {
>    if (x[i]!=y[i])
>      if (tolower(x[i])==tolower(y[i]))
>        casecmp++;
>      else
>        {
> 	 fprintf(stderr,"dif chars %i tolower_needed %i\n", i, casecmp+1);
> 	 return tolower(x[i])-tolower(y[i]);
>        }
>    if (!x[i]) 
>      {
>        fprintf(stderr,"same chars %i tolower_needed %i \n",i, casecmp);
>        return 0;
>      }
>    i++;
>  }
>  return 0;
> }
> 
> The downside of this implementation is that checking aaaa vs AAAA will be
> slower; as this looks like an unlikely case, we can make this tradeoff.
> 
> I added it in a generic way, as I also plan to add an ssse3 loop version,
> which will come in a separate patch.
> 
> 	* sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> 	Add strcasecmp_l-sse2-unaligned.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
> 	strcasecmp_sse2_unaligned.
> 	* sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
> 	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
> 	implementation.
> 	* sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.
> 
> ---
>  sysdeps/x86_64/locale-defines.sym                  |   1 +
>  sysdeps/x86_64/multiarch/Makefile                  |   1 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |   2 +
>  .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S |   2 +
>  sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S   | 117 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strcmp.S                  |   9 +-
>  6 files changed, 127 insertions(+), 5 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> 
> diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
> index aebff9a..804debb 100644
> --- a/sysdeps/x86_64/locale-defines.sym
> +++ b/sysdeps/x86_64/locale-defines.sym
> @@ -8,4 +8,5 @@ LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
>  LC_CTYPE
>  _NL_CTYPE_NONASCII_CASE
>  LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
> +LOCALE_TOLOWER			offsetof (struct __locale_struct, __ctype_tolower)
>  SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..551923c 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>  		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>  		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> +		   strcasecmp_l-sse2-unaligned \
>  		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>  		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
>  		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 1a65ac0..40f8895 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  			      __strcasecmp_avx)
>  	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
>  			      __strcasecmp_sse42)
> +	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
> +			      __strcasecmp_sse2_unaligned)
>  	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
>  			      __strcasecmp_ssse3)
>  	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> new file mode 100644
> index 0000000..62ce37e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
> @@ -0,0 +1,2 @@
> +#define AS_STRCASECMP
> +#include "strcmp-sse2-unaligned.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> index eed8432..c93d2f5 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
> @@ -16,10 +16,33 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> +#ifndef NOT_IN_libc
> +
>  #include "sysdep.h"
>  #define ALIGN(x)	.p2align x
>  
> +#ifdef AS_STRCASECMP
> +# include "locale-defines.h"
> +
> +#define __strcasecmp_sse2_unaligned strcasecmp_new
> +
> +# define  __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
> +ENTRY (__strcasecmp_sse2_unaligned)
> +	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
> +	mov	%fs:(%rax), %rdx
> +        // XXX 5 byte should be before the function
> +        /* 5-byte NOP.  */
> +        .byte   0x0f,0x1f,0x44,0x00,0x00
> +
> +END (__strcasecmp_sse2_unaligned)
> +
> +#endif
> +
>  ENTRY ( __strcmp_sse2_unaligned)
> +
> +#ifdef AS_STRCASECMP
> +	mov	LOCALE_TOLOWER(%rdx), %r11
> +#endif
>  	movl	%edi, %eax
>  	xorl	%edx, %edx
>  	pxor	%xmm7, %xmm7
> @@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
>  	pmovmskb	%xmm0, %eax
>  	testq	%rax, %rax
>  	je	L(next_48_bytes)
> +#ifndef AS_STRCASECMP
>  L(return):
>  	bsfq	%rax, %rdx
>  	movzbl	(%rdi, %rdx), %eax
>  	movzbl	(%rsi, %rdx), %edx
>  	subl	%edx, %eax
>  	ret
> +#else
> +	jmp	L(caseloop1)
> +#endif
>  
>  	ALIGN (4)
>  L(next_48_bytes):
> @@ -85,6 +112,76 @@ L(main_loop_header):
>  	movq	%rcx, %rsi
>  	jmp	L(loop_start)
>  
> +#ifdef AS_STRCASECMP
> +L(caseloop1):
> +	bsfq	%rax, %rdx
> +	leaq	-1(%rax), %rcx
> +	andq	%rax, %rcx
> +	movzbl	(%rdi, %rdx), %eax
> +	movzbl	(%rsi, %rdx), %edx
> +	movl	(%r11, %rax, 4), %eax
> +	movl	(%r11, %rdx, 4), %edx
> +	testl	%eax, %eax
> +	je	L(zero1)
> +	cmpl	%edx, %eax
> +	je	L(casecnt1)
> +L(zero1):
> +	subl	%edx, %eax
> +	ret
> +L(casecnt1):
> +	testq	%rcx, %rcx
> +	je	L(next_48_bytes)
> +	movq	%rcx, %rax
> +	jmp	L(caseloop1)
> +
> +L(return):
> +L(caseloop2):
> +	bsfq	%rax, %rdx
> +	leaq	-1(%rax), %rcx
> +	andq	%rax, %rcx
> +	movzbl	(%rdi, %rdx), %eax
> +	movzbl	(%rsi, %rdx), %edx
> +	movl	(%r11, %rax, 4), %eax
> +	movl	(%r11, %rdx, 4), %edx
> +	testl	%eax, %eax
> +	je	L(zero2)
> +	cmpl	%edx, %eax
> +	je	L(casecnt2)
> +L(zero2):
> +	subl	%edx, %eax
> +	ret
> +L(casecnt2):
> +	testq	%rcx, %rcx
> +	je	L(main_loop_header)
> +	movq	%rcx, %rax
> +	jmp	L(caseloop2)
> +
> +L(caseloop3):
> +	bsfq	%rax, %rdx
> +	leaq	-1(%rax), %r10
> +	andq	%rax, %r10
> +	movzbl	(%rdi, %rdx), %eax
> +	movzbl	(%rsi, %rdx), %edx
> +	movl	(%r11, %rax, 4), %eax
> +	movl	(%r11, %rdx, 4), %edx
> +	testl	%eax, %eax
> +	je	L(zero3)
> +	cmpl	%edx, %eax
> +	je	L(casecnt3)
> +L(zero3):
> +	subl	%edx, %eax
> +	ret
> +L(casecnt3):
> +	movq	%rdi, %rax
> +	movq	%rsi, %rdx
> +	testq	%r10, %r10
> +	je	L(back_to_loop)
> +	movq	%r10, %rax
> +	jmp	L(caseloop3)
> +
> +#endif
> +
> +
>  	ALIGN	(4)
>  L(loop):
>  	addq	$64, %rax
> @@ -135,11 +232,18 @@ L(back_to_loop):
>  	orq	%rdi, %rcx
>  	salq	$48, %rsi
>  	orq	%rsi, %rcx
> +#ifndef AS_STRCASECMP
>  	bsfq	%rcx, %rcx
>  	movzbl	(%rax, %rcx), %eax
>  	movzbl	(%rdx, %rcx), %edx
>  	subl	%edx, %eax
>  	ret
> +#else
> +	movq	%rax, %rdi
> +	movq	%rdx, %rsi
> +	movq	%rcx, %rax
> +	jmp	L(return)
> +#endif
>  
>  	ALIGN (4)
>  L(loop_cross_page):
> @@ -185,11 +289,19 @@ L(loop_cross_page):
>  	shrq	%cl, %rdi
>  	test	%rdi, %rdi
>  	je	L(back_to_loop)
> +#ifndef AS_STRCASECMP
>  	bsfq	%rdi, %rcx
>  	movzbl	(%rax, %rcx), %eax
>  	movzbl	(%rdx, %rcx), %edx
>  	subl	%edx, %eax
>  	ret
> +#else
> +	movq	%rdi, %r10
> +	movq	%rax, %rdi
> +	movq	%rdx, %rsi
> +	movq	%r10, %rax
> +	jmp	L(caseloop3)
> +#endif
>  
>  	ALIGN (4)
>  L(cross_page_loop):
> @@ -201,6 +313,10 @@ L(cross_page_loop):
>  L(cross_page):
>  	movzbl	(%rdi, %rdx), %eax
>  	movzbl	(%rsi, %rdx), %ecx
> +#ifdef AS_STRCASECMP
> +	movl	(%r11, %rax, 4), %eax
> +	movl	(%r11, %rcx, 4), %ecx
> +#endif
>  	testb	%al, %al
>  	jne	L(cross_page_loop)
>  	xorl	%eax, %eax
> @@ -208,3 +324,4 @@ L(different):
>  	subl	%ecx, %eax
>  	ret
>  END (__strcmp_sse2_unaligned)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
> index c5dcd1a..818aa31 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.S
> +++ b/sysdeps/x86_64/multiarch/strcmp.S
> @@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
>  	jne	1f
>  	call	__init_cpu_features
>  1:
> +	leaq	__strcasecmp_sse2_unaligned(%rip), %rax
> +	testl   $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
> +	jnz     3f
> +
>  #  ifdef HAVE_AVX_SUPPORT
>  	leaq	__strcasecmp_avx(%rip), %rax
>  	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
>  	jnz	3f
>  #  endif
> -	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> -	jnz	2f
> -	leaq	__strcasecmp_sse42(%rip), %rax
> -	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> -	jnz	3f
>  2:	leaq	__strcasecmp_ssse3(%rip), %rax
>  	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
>  	jnz	3f
> -- 
> 1.8.3.2

-- 

Too much radiation coming from the soil.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]