This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]

IA-64 strncpy


Hi!

Before Uli fixed strnlen, I started hacking on IA-64 strncpy because strncpy
was calling broken strnlen and I thought I might get some better results
from actually doing it all in strncpy.S.
The result is attached.
In the tarball there are results on IA-64 for the broken strncpy using
strnlen on top of memchr, the attached strncpy.S, strncpy.S using Uli's
current strnlen.c and strncpy.S using my strnlen.c.
The attached strncpy.S beats them all for small strings (or sizes) and for
aligned strings. Unfortunately, for some strange reason strnlen+memcpy+memset
is faster for long unaligned strings (unaligned meaning ((dst^src) & 0x7) != 0).
Can any IA-64 assembly hacker look into it?
As the code is not too different from strcpy.S, I'd guess strcpy.S might
have similar performance problems too.
In the worst case, strncpy.S could use the attached code unless it detects
early on that it is handling a long unaligned string, in which case it could
call strnlen/memcpy/memset instead; I am not sure.

	Jakub
/* Optimized version of the standard strncpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   Contributed by Dan Pop <Dan.Pop@cern.ch>
	      and Jakub Jelinek <jakub@redhat.com>.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest

   Inputs:
	in0:    dest
	in1:    src
	in2:	len

   In this form, it assumes little endian mode.
 */

#include <sysdep.h>
#undef ret

/* Symbolic names for the registers used by strncpy below.

   saved_lc  copy of ar.lc taken on entry, written back before returning
   saved_pr  copy of the predicate registers taken on entry, written back
	     before returning
   thresh    8 - (src % 8): number of bytes of the current aligned source
	     word that remain after shifting it right by sh1 bits
   dest      running destination pointer
   dest2     second destination pointer, used only by the zero-fill loop
   src       running source pointer
   len       number of bytes still to be written
   asrc      src rounded down to a multiple of 8 (unaligned-source path)
   tmp       scratch
   pos       result of czx1.r: index of the first zero byte in a word,
	     compared against 8 to mean "no zero byte"
   w0, w1    not referenced by any instruction in this file; the names
	     occur only in comments
   c         current byte in the byte loops; also scratch for loop counts
	     and masks
   sh2       64 - sh1
   sh1       8 * (src % 8): bit shift that realigns the source data
   loopcnt   value loaded into ar.lc for the counted loops
   value     the 8-byte word currently being assembled or stored  */
#define saved_lc	r15
#define saved_pr	r16
#define thresh		r17
#define dest		r18
#define dest2		r19
#define src		r20
#define len		r21
#define asrc		r22
#define tmp		r23
#define pos		r24
#define w0		r25
#define w1		r26
#define c		r27
#define sh2		r28
#define	sh1		r29
#define loopcnt		r30
#define	value		r31

/* strncpy (in0 = dest, in1 = src, in2 = len); returns dest in ret0.

   Outline:
     - len <= 24: plain byte loop (.short_len).
     - otherwise copy bytes until dest is 8-byte aligned (.l1), then copy
       8 bytes per iteration:
	 .l3  when src is now 8-byte aligned as well,
	 .l2  when it is not; each output word is built from two
	      consecutive aligned source words with shifts.
       Source words are fetched with speculative loads (ld8.s) and checked
       with chk.s, because the loops read ahead of the bytes they have
       examined so far.
     - once a zero byte has been seen (.found0) the rest of the
       destination is filled with zeros, 16 bytes per iteration, then 8,
       then byte by byte.
     - if len runs out first (.not_found0) the last partial word is
       written byte by byte from `value'.

   The recovery paths only redo a load non-speculatively when the word is
   really required; otherwise the deferred fault would become a real fault
   on memory that must not be touched.  */
ENTRY(strncpy)
	.prologue
	alloc 	r2 = ar.pfs, 3, 0, 29, 32

#define MEMLAT 2
	.rotr	r[MEMLAT + 2]
	.rotp	p[MEMLAT + 1]

	mov	ret0 = in0		// return value = dest
	.save pr, saved_pr
	mov	saved_pr = pr           // save the predicate registers
	.save ar.lc, saved_lc
	mov 	saved_lc = ar.lc	// save the loop counter
	mov	ar.ec = 0		// the br.ctop loops below rely on
					// ec == 0: the final fall-through
					// must not rotate the registers
	.body
	cmp.geu p6, p5 = 24, in2	// p6 = (len <= 24), p5 = (len > 24)
(p6)	br.cond.spnt .short_len
	sub	tmp = r0, in0 ;;	// tmp = -dest
	mov	len = in2		// len
	mov 	dest = in0		// dest
	mov 	src = in1		// src
	and	tmp = 7, tmp ;;		// tmp = -dest % 8
	cmp.eq	p6, p7 = tmp, r0	// p6 = dest already aligned; p7 = !p6
	adds	loopcnt = -1, tmp	// loopcnt = tmp - 1
(p6)	br.cond.sptk .dest_aligned ;;
	sub	len = len, tmp		// len -= -dest % 8
	mov	ar.lc = loopcnt
.l1:					// copy -dest % 8 bytes
(p5)	ld1	c = [src], 1		// c = *src++ (skipped once c == 0)
	;;
	st1	[dest] = c, 1		// *dest++ = c
	cmp.ne	p5, p7 = c, r0		// p5 = keep loading, p7 = saw a zero
	br.cloop.dptk .l1 ;;
(p7)	br.cond.dpnt	.found0_align

.dest_aligned:				// p7 should be cleared here
	shr.u	c = len, 3		// c = len / 8
	and	sh1 = 7, src 		// sh1 = src % 8
	and	asrc = -8, src ;;	// asrc = src & -OPSIZ  -- align src
	adds	c = (MEMLAT-1), c	// c = (len / 8) + MEMLAT - 1
	sub	thresh = 8, sh1		// thresh = 8 - src % 8
	mov	pr.rot = 1 << 16	// set rotating predicates
	shl	sh1 = sh1, 3 ;;		// sh1 = 8 * (src % 8)
	mov	ar.lc = c		// trip count for the aligned loop .l3
	sub	sh2 = 64, sh1		// sh2 = 64 - sh1
	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
(p6)    br.cond.sptk .src_aligned
	adds	c = -(MEMLAT-1), c ;;	// c = (len / 8)
	ld8	r[1] = [asrc],8		// first aligned source word
	mov	ar.lc = c ;;		// trip count for the unaligned loop

	.align	32
.l2:					// here p6 = previous word is complete
(p6)	st8	[dest] = value, 8	// store val to dest
	ld8.s	r[0] = [asrc], 8	// speculatively fetch the next word
	shr.u	value = r[1], sh1 ;; 	// value = w0 >> sh1
	czx1.r	pos = value ;;		// do we have an "early" zero
	cmp.lt	p7, p0 = pos, thresh	// in w0 >> sh1?
	adds	len = -8, len		// len -= 8
(p7)	br.cond.dpnt .nonalign_found0
	chk.s	r[0], .recovery2	// it is safe to do that only
.back2:					// after the previous test
	shl	tmp = r[0], sh2  	// tmp = w1 << sh2
	;;
	or	value = value, tmp ;;	// value |= tmp
	czx1.r	pos = value ;;
	cmp.ne	p7, p6 = 8, pos		// p7 = zero byte found, p6 = none
(p7)	br.cond.dpnt .nonalign_found0
	br.ctop.dptk    .l2 ;;
	adds	len = 8, len		// undo the last len -= 8
	br.cond.sptk	.not_found0 ;;
.nonalign_found0:
	cmp.gtu	p6, p0 = -8, len	// p6 = len did not go below zero
(p6)	br.cond.dptk .found0
	adds	len = 8, len		// fewer than 8 bytes were left:
	br.cond.sptk	.not_found0 ;;	// finish them byte by byte

	.align	32
.src_aligned:
.l3:
(p[0])		ld8.s	r[0] = [src], 8
(p[MEMLAT])	chk.s	r[MEMLAT], .recovery3
.back3:
(p[MEMLAT])	mov	value = r[MEMLAT]
(p[MEMLAT])	czx1.r	pos = r[MEMLAT] ;;
(p[MEMLAT])	cmp.ne	p7, p0 = 8, pos
(p[MEMLAT])	adds	len = -8, len	// len -= 8
(p7)		br.cond.dpnt .found0
(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
		br.ctop.dptk .l3 ;;

	// The loop is over; r[MEMLAT-1] holds the word that contains the
	// remaining len (< 8) bytes.
	chk.s	r[MEMLAT-1], .recovery4
.back4:
	mov	value = r[MEMLAT-1]

.not_found0:				// write the low len bytes of value
	cmp.eq	p5, p6 = len, r0
	adds	len = -1, len
(p5)	br.cond.dptk	.restore_and_exit ;;
	mov	ar.lc = len
.l4:
(p6)	extr.u	c = value, 0, 8		// c = value & 0xff
(p6)	shr.u	value = value, 8 ;;
	st1	[dest] = c, 1
	cmp.ne	p6, p0 = c, r0		// after a zero, c stays zero
	br.cloop.dptk	.l4
	;;
	br.cond.sptk	.restore_and_exit

.found0_align:				// zero seen while aligning dest
	mov	pos = 0
	adds	len = -8, len
	mov	value = 0 ;;
.found0:				// pos = index of the zero in value
	shl	tmp = pos, 3		// tmp = 8 * pos
	shr.u	loopcnt = len, 4	// loopcnt = len / 16
	mov	c = -1 ;;
	cmp.eq	p6, p0 = loopcnt, r0	// p6 = no 16-byte chunk to clear
	adds	loopcnt = -1, loopcnt
	shl	c = c, tmp ;;		// c = mask of the bytes from pos up
	and	len = 0xf, len
	andcm	value = value, c	// clear everything after the zero
	mov	ar.lc = loopcnt ;;
	cmp.le	p7, p0 = 8, len		// p7 = one more 8-byte store needed
	adds	dest2 = 16, dest	// dest2 = dest + 16
	st8	[dest] = value, 8	// dest advances by 8: dest2 = dest + 8
	and	len = 0x7, len
(p6)	br.cond.dpnt	.l6 ;;
.l5:					// clear 16 bytes per iteration
	st8	[dest] = r0, 16
	st8	[dest2] = r0, 16
	br.cloop.dptk	.l5 ;;
.l6:
(p7)	st8	[dest] = r0, 8
	cmp.eq	p5, p0 = len, r0
	adds	len = -1, len
(p5)	br.cond.dptk .restore_and_exit ;;
	mov	ar.lc = len ;;
.l7:					// clear the last len (< 8) bytes
	st1	[dest] = r0, 1
	br.cloop.dptk	.l7 ;;
.restore_and_exit:
	mov 	ar.lc = saved_lc	// restore the loop counter
	mov	pr = saved_pr, -1	// restore the predicate registers
	br.ret.sptk.many b0

.short_len:
	cmp.eq	p5, p0 = in2, r0
	adds	loopcnt = -1, in2
(p5)	br.cond.spnt .restore_and_exit ;;
	mov	ar.lc = loopcnt		// p6 should be set when we get here
.l8:
(p6)	ld1	c = [in1], 1		// c = *src++
	;;
	st1	[in0] = c, 1		// *dest++ = c
(p6)	cmp.ne	p6, p0 = c, r0		// after a zero, stop loading
	br.cloop.dptk .l8
	;;				// keep the ar.lc write below in a
					// separate group from br.cloop
	mov 	ar.lc = saved_lc	// restore the loop counter
	mov	pr = saved_pr, -1	// restore the predicate registers
	br.ret.sptk.many b0
.recovery2:
	// len was already decremented by 8 in this iteration, so len + 8
	// bytes are still wanted and thresh of them come from w0.  The
	// next word is needed only if more than thresh bytes are wanted.
	add	c = 8, len
	add	tmp = -8, asrc ;;
	cmp.gtu	p8, p5 = c, thresh ;;
(p8)	ld8	r[0] = [tmp]		// needed: redo the load for real
(p5)	mov	r[0] = r0		// not needed: do not touch memory
	br.cond.sptk .back2
.recovery3:
	add	tmp = -(MEMLAT + 1) * 8, src ;;
	ld8	r[MEMLAT] = [tmp]
	br.cond.sptk .back3
.recovery4:
	// .back4 reads r[MEMLAT-1], which was loaded from src - MEMLAT * 8.
	// With len == 0 no byte of that word is used, so skip the load.
	cmp.eq	p5, p6 = len, r0
	add	tmp = -MEMLAT * 8, src ;;
(p6)	ld8	r[MEMLAT - 1] = [tmp]
(p5)	mov	r[MEMLAT - 1] = r0
	br.cond.sptk .back4 ;;
END(strncpy)

strncpy.tar.bz2


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]