This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

SH: optimized memcpy/memset


Hi,

This patch gives an optimized memcpy/memset for SH. The original patch
was written by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>.

	kaz
--
2002-07-05 Kaz Kojima  <kkojima@rr.iij4u.or.jp>

	* sysdeps/sh/memcpy.S: Optimize. Based on a patch by Toshiyasu
	Morita <toshiyasu.morita@hsa.hitachi.com>.
	* sysdeps/sh/memset.S: Likewise.

Index: memcpy.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memcpy.S,v
retrieving revision 1.2
diff -u -r1.2 memcpy.S
--- memcpy.S	6 Jul 2001 04:56:03 -0000	1.2
+++ memcpy.S	5 Jul 2002 08:07:17 -0000
@@ -1,5 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
+   Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+   Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -19,213 +21,179 @@
 #include <sysdep.h>
 #include <endian.h>
 
-/*
- * void *memcpy(void *dst, const void *src, size_t n);
- * No overlap between the memory of DST and of SRC are assumed.
- */
+/* void *memcpy(void *dst, const void *src, size_t n);
+    The memory areas of DST and SRC are assumed not to overlap.  */
 
 ENTRY(memcpy)
-	tst	r6,r6
-	bt/s	1f
-	mov	r4,r0
-	mov	#12,r1
-	cmp/gt	r6,r1
-	bf	2f
-0:
-	mov.b	@r5+,r1
-	dt	r6
+	mov	r4,r3		/* Save destination.  */
+
+	/* If less than 11 bytes, just do a byte copy.  */
+	mov	#11,r0
+	cmp/gt	r6,r0		/* T = (11 > n), signed compare.  */
+	bt	L_byteloop_init
+
+	/* Check if we need to word-align source.  */
+	mov	r5,r0		/* r0 = working source pointer.  */
+	tst	#1,r0
+	bt	L_wordalign
+
+	mov.b	@r0+,r1		/* Copy one byte.  */
+	add	#-1,r6		/* One byte fewer left to copy.  */
 	mov.b	r1,@r4
-	bf/s	0b
 	add	#1,r4
-1:
-	rts
-	nop
-2:	
-	mov.l	r8,@-r15
-	mov.l	r9,@-r15
-	mov	r6,r2
-	mov.l	r10,@-r15
-	mov.l	r11,@-r15
-	mov.l	r14,@-r15
-	mov	r4,r11
-	mov	r15,r14
-	mov	r5,r0
-	and	#1,r0
-	tst	r0,r0
-	bt/s	.L42
-	mov	r5,r0
-	mov.b	@r5+,r1
-	add	#-1,r2
+
+	.balignw 4,0x0009	/* Pad to 4 bytes with 0x0009 (SH nop).  */
+L_wordalign:
+	/* Check if we need to longword-align source.  */
+	tst	#2,r0
+	bt	L_copy
+
+	mov.w	@r0+,r1		/* Copy one word.  */
+	add	#-2,r6		/* Two bytes fewer left to copy.  */
+#if __BYTE_ORDER == __BIG_ENDIAN
 	add	#1,r4
-	mov.b	r1,@r11
-	mov	r5,r0
-.L42:
-	and	#2,r0
-	tst	r0,r0
-	bt/s	.L43
-	mov	r4,r0
-	mov.b	@r5+,r1
 	mov.b	r1,@r4
-	mov.b	@r5+,r1
+	shlr8	r1		/* Bring the high byte down.  */
+	mov.b	r1,@-r4		/* Pre-decrement: store one byte lower.  */
+	add	#2,r4
+#else
+	mov.b	r1,@r4
 	add	#1,r4
-	add	#-2,r2
+	shlr8	r1		/* Bring the high byte down.  */
 	mov.b	r1,@r4
 	add	#1,r4
+#endif
+L_copy:
+	mov	r0,r5		/* r5 = source, advanced past the pre-copy.  */
+
+	/* Calculate the correct routine to handle the destination
+	   alignment and simultaneously calculate the loop counts for
+	   both the 2 word copy loop and byte copy loop.  */
+	mova	L_jumptable,r0
+	mov	r0,r1		/* r1 = address of the jump table.  */
 	mov	r4,r0
-.L43:
-	and	#1,r0
-	tst	r0,r0
-	bf/s	.L38
-	mov	r4,r0
-	and	#2,r0
-	tst	r0,r0
-	bf/s	.L7
-	mov	r2,r0
-	shlr2	r0
+	mov	r6,r7
 	and	#3,r0
-	cmp/eq	#2,r0
-	bt/s	.L10
-	mov	#2,r1
-	cmp/gt	r1,r0
-	bt/s	.L14
-	cmp/eq	#3,r0
-	cmp/eq	#1,r0
-	bt/s	.L11
-	mov	r0,r1
-	bra	.L44
-	shll2	r1
-	.align 5
-.L14:
-	bf	.L8
-	mov.l	@(8,r5),r1
-	mov.l	r1,@(8,r4)
-.L10:
-	mov.l	@(4,r5),r1
-	mov.l	r1,@(4,r4)
-.L11:
-	mov.l	@r5,r1
-	mov.l	r1,@r4
-.L8:
-	mov	r0,r1
-	shll2	r1
-.L44:
-	add	r1,r4
-	add	r1,r5
-	mov	r2,r0
-	mov	#-4,r1
-	shad	r1,r0
-	mov	#3,r6
-	bra	.L37
-	and	r2,r6
-	.align 5
-.L18:
+	shlr2	r7
+	shll	r0		/* r0 = (dest & 3) * 2: table byte offset.  */
+	shlr	r7		/* r7 = r6 / 8: count of 8-byte iterations.  */
+	mov.w	@(r0,r1),r2	/* r2 = routine offset from L_base.  */
+	mov	#7,r0
+	braf	r2		/* Jump to L_base + r2.  */
+	and	r0,r6		/* Delay slot: r6 = r6 % 8, the byte tail.  */
+L_base:
+
+	.balign	4
+L_jumptable:
+	.word	L_copydest0 - L_base
+	.word	L_copydest1_or_3 - L_base
+	.word	L_copydest2 - L_base
+	.word	L_copydest1_or_3 - L_base
+
+	.balign	4
+	/* Copy routine for (dest mod 4) == 1 or == 3.  */
+L_copydest1_or_3:
+	add	#-1,r4		/* Bias dest; offsets below are from dest - 1.  */
+	.balignw 4,0x0009
+L_copydest1_or_3_loop:
+	mov.l	@r5+,r0		/* Read first longword.  */
+	dt	r7		/* Decrement count; T set when it hits zero.  */
+	mov.l	@r5+,r1		/* Read second longword.  */
+#if __BYTE_ORDER == __BIG_ENDIAN
+	/* Write first longword as byte, word, byte.  */
+	mov.b	r0,@(4,r4)
+	shlr8	r0
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.b	r0,@(1,r4)
+	mov	r1,r0
+	/* Write second longword as byte, word, byte.  */
+	mov.b	r0,@(8,r4)
+	shlr8	r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.b	r0,@(5,r4)
+#else
+	/* Write first longword as byte, word, byte.  */
+	mov.b	r0,@(1,r4)
+	shlr8	r0
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.b	r0,@(4,r4)
+	mov	r1,r0
+	/* Write second longword as byte, word, byte.  */
+	mov.b	r0,@(5,r4)
+	shlr8	r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.b	r0,@(8,r4)
+#endif
+	bf/s	L_copydest1_or_3_loop	/* Loop while T (from dt) is clear.  */
+	add	#8,r4		/* Delay slot: advance dest.  */
+
+	bra	L_byteloop_init
+	add	#1,r4		/* Delay slot: undo the -1 bias.  */
+
+	.balign 4
+	/* Copy routine for (dest mod 4) == 2.  */
+L_copydest2:
+L_copydest2_loop:
+	mov.l	@r5+,r0
+	dt	r7		/* Decrement count; T set when it hits zero.  */
 	mov.l	@r5+,r1
-	mov.l	@r5+,r2
-	mov.l	@r5+,r3
-	mov.l	@r5+,r7
-	mov.l	r1,@r4
-	mov.l	r2,@(4,r4)
-	mov.l	r3,@(8,r4)
-	mov.l	r7,@(12,r4)
-	add	#16,r4
-.L37:
-	cmp/pl	r0
-	bt/s	.L18
-	add	#-1,r0
-	mov	r6,r2
-.L38:
-	bra	.L40
-	mov	r2,r0
-	.align 5
-.L7:
-	shar	r0
-	and	#3,r0
-	cmp/eq	#2,r0
-	bt/s	.L23
-	mov	#2,r1
-	cmp/gt	r1,r0
-	bt/s	.L27
-	cmp/eq	#3,r0
-	cmp/eq	#1,r0
-	bt/s	.L24
-	mov	r0,r1
-	bra	.L45
-	add	r0,r1
-	.align 5
-.L27:
-	bf	.L21
-	add	#4,r5
-	mov.w	@r5,r1
-	add	#4,r4
-	mov.w	r1,@r4
-	add	#-4,r5
-	add	#-4,r4
-.L23:
-	add	#2,r5
-	mov.w	@r5,r1
-	add	#2,r4
-	mov.w	r1,@r4
-	add	#-2,r5
-	add	#-2,r4
-.L24:
-	mov.w	@r5,r1
-	mov.w	r1,@r4
-.L21:
-	mov	r0,r1
-	add	r0,r1
-.L45:
-	add	r1,r4
-	add	r1,r5
-	mov	r2,r0
-	mov	#-3,r1
-	shad	r1,r0
-	mov	#1,r10
-	mov	r0,r1
-	and	r2,r10
-	cmp/pl	r1
-	bf/s	.L29
-	add	#-1,r0
-	mov	r4,r9
-	mov	r4,r8
-	add	#4,r9
-	mov	r4,r6
-	add	#6,r8
-	add	#2,r6
-.L31:
-	mov.w	@r5+,r1
-	mov.w	@r5+,r2
-	mov.w	@r5+,r3
-	mov.w	@r5+,r7
-	mov.w	r1,@r4
-	mov.w	r2,@r6
+#if __BYTE_ORDER == __BIG_ENDIAN
+	mov.w	r0,@(2,r4)
+	shlr16	r0
+	mov.w	r0,@r4
+	mov	r1,r0
+	mov.w	r0,@(6,r4)
+	shlr16	r0
+	mov.w	r0,@(4,r4)
+#else
+	mov.w	r0,@r4
+	shlr16	r0
+	mov.w	r0,@(2,r4)
+	mov	r1,r0
+	mov.w	r0,@(4,r4)
+	shlr16	r0
+	mov.w	r0,@(6,r4)
+#endif
+	bf/s	L_copydest2_loop	/* Loop while T (from dt) is clear.  */
 	add	#8,r4
-	mov	r0,r1
-	add	#8,r6
-	mov.w	r3,@r9
-	add	#-1,r0
-	add	#8,r9
-	mov.w	r7,@r8
-	cmp/pl	r1
-	bt/s	.L31
-	add	#8,r8
-.L29:
-	mov	r10,r2
-	mov	r2,r0
-.L40:
-	cmp/pl	r0
-	bf	.L34
-.L35:
-	mov.b	@r5+,r1
-	dt	r2
-	mov.b	r1,@r4
-	bf/s	.L35
+
+	bra	L_byteloop_init
+	nop
+
+	.balign 4
+	/* Copy routine for (dest mod 4) == 0.  */
+L_copydest0:
+	add	#-8,r4		/* Pre-bias; the loop adds 8 before storing.  */
+	.balignw 4,0x0009
+L_copydest0_loop:
+	mov.l	@r5+,r0
+	dt	r7		/* Decrement count; T set when it hits zero.  */
+	mov.l	@r5+,r1
+	add	#8,r4
+	mov.l	r0,@r4
+	bf/s	L_copydest0_loop	/* Loop while T (from dt) is clear.  */
+	mov.l	r1,@(4,r4)	/* Delay slot: store second longword.  */
+
+	add	#8,r4		/* Fall through.  */
+
+L_byteloop_init:
+	tst	r6,r6		/* Nothing left to copy?  */
+	bt	L_exit
+
+	.balignw 4,0x0009
+	/* Copy remaining bytes.  */
+L_byteloop:
+	mov.b	@r5+,r0
+	dt	r6		/* Decrement count; T set when it hits zero.  */
+	mov.b	r0,@r4
+	bf/s	L_byteloop
 	add	#1,r4
-.L34:
-	mov	r11,r0
-	mov	r14,r15
-	mov.l	@r15+,r14
-	mov.l	@r15+,r11
-	mov.l	@r15+,r10
-	mov.l	@r15+,r9
-	rts	
-	mov.l	@r15+,r8
+
+L_exit:
+	rts
+	mov	r3,r0		/* Return destination.  */
+END(memcpy)
Index: memset.S
===================================================================
RCS file: /cvs/glibc/libc/sysdeps/sh/memset.S,v
retrieving revision 1.2
diff -u -r1.2 memset.S
--- memset.S	6 Jul 2001 04:56:03 -0000	1.2
+++ memset.S	5 Jul 2002 08:07:17 -0000
@@ -1,6 +1,7 @@
-/* Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+/* Copyright (C) 1999, 2000, 2002 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
+   Optimized by Toshiyasu Morita <toshiyasu.morita@hsa.hitachi.com>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -19,61 +20,71 @@
 
 #include <sysdep.h>
 
-/* void *memset (t, c, len)  */
+/* void *memset (void *t, int c, size_t len);  Stores (unsigned char) c.  */
 
 ENTRY(memset)
-	tst	r6, r6
-	bt/s	end
-	mov	r4, r3
-	mov	#3, r0
-	cmp/hs	r6, r0
-	bt/s	2f
-	and	r4, r0
-	tst	r0, r0
-	bt/s	1f
-	add	r0, r6
-	add	#-1, r0
-	shll2	r0
-	braf	r0
-	add	#-4, r6
-
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-1:
-	extu.b	r5, r0
-	shll8	r5
-	or	r5, r0
-	extu.w	r0, r0
-	mov	r0, r5
-	swap.w	r5, r5
-	or	r0, r5
-	
-2:
-	add	#-4, r6
-	cmp/pz	r6
-	bf	afew
-	mov.l	r5, @r4
-	bra	2b
-	add	#4, r4
-
-afew:
-	mov	#-1, r0
-	sub	r6, r0
-	shll2	r0
-	braf	r0
-	nop
-
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-	mov.b	r5, @r4
-	add	#1, r4
-end:
+	mov	#12,r0
+	cmp/gt	r6,r0		/* T = (12 > len), signed compare.  */
+	bt.s	L_byte_loop_init	/* Short fill: byte stores only.  */
+	mov	r4,r7		/* Delay slot: save destination.  */
+
+	/* Reduce C to an unsigned char, then replicate it into all four
+	   bytes of r5; stray upper bits would corrupt the pattern.  */
+	extu.b	r5,r5
+	swap.b	r5,r1
+	or	r1,r5		/* Low 16 bits = byte, byte.  */
+	swap.w	r5,r1
+	or	r1,r5		/* All 32 bits = byte x 4.  */
+
+	mov	r4,r0
+	tst	#1,r0		/* Destination odd?  */
+	bt	L_wordalign
+
+	mov.b	r5,@r4		/* Store one byte to reach an even address.  */
+	add	#-1,r6
+	add	#1,r4
+	mov	r4,r0
+
+	.balignw 4,0x0009	/* Pad to 4 bytes with 0x0009 (SH nop).  */
+L_wordalign:
+	tst	#2,r0		/* Destination longword-aligned?  */
+	bt	L_word_loop_init
+
+	mov.w	r5,@r4		/* Store one word to reach a 4-byte boundary.  */
+	add	#-2,r6
+	add	#2,r4
+	mov	r4,r0
+
+	.balignw 4,0x0009
+L_word_loop_init:
+	mov	r6,r3
+	shlr2	r3
+	mov	#7,r0
+	shlr	r3		/* r3 = r6 / 8: count of 8-byte iterations.  */
+	and	r0,r6		/* r6 = r6 % 8: byte tail.  */
+
+	.balignw 4,0x0009
+L_2word_loop:
+	mov.l	r5,@r4
+	dt	r3		/* Decrement count; T set when it hits zero.  */
+	mov.l	r5,@(4,r4)
+	bf.s	L_2word_loop	/* Loop while T (from dt) is clear.  */
+	add	#8,r4		/* Delay slot: advance destination.  */
+
+	.balignw 4,0x0009
+L_byte_loop_init:
+	tst	r6,r6		/* Nothing left to fill?  */
+	bt	L_byte_exit
+
+	.balignw 4,0x0009
+L_byte_loop:
+	mov.b	r5,@r4		/* mov.b stores only the low byte of r5.  */
+	dt	r6
+	bf.s	L_byte_loop
+	add	#1,r4		/* Delay slot: advance destination.  */
+
+	.balignw 4,0x0009
+L_byte_exit:
 	rts
-	mov	r3, r0
+	mov	r7,r0		/* Delay slot: return destination.  */
 END(memset)


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]