This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch master updated. glibc-2.17-139-gd542f8e

From: mshawcroft at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 17 Jan 2013 10:56:57 -0000
Subject: GNU C Library master sources branch master updated. glibc-2.17-139-gd542f8e
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  d542f8ed21474035990eddbf9dfdf11660692335 (commit)
       via  38fecb39a0c3daecdc44973422894443b378c606 (commit)
       via  7ac4d962399145b6a913d9d39a38514a7ffc0e9f (commit)
       via  c2b6221ed3be421c2f7a8ad32e70ad501340b55b (commit)
       via  857c8d222813f26e5178d3de1f272d1c3d9e24ac (commit)
       via  58faa0874bc55f7520ed2255aaa7c5aa42e26292 (commit)
       via  14d941e4dce164097aa6165828d13441ae2a4dbe (commit)
       via  62216a0a1e79ea5ce17ad99ef9efbb4aa2afb8c0 (commit)
      from  881ebe897a8b79b243f0e3a3a8aab6eea0119693 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d542f8ed21474035990eddbf9dfdf11660692335

commit d542f8ed21474035990eddbf9dfdf11660692335
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:57:42 2013 +0000

    AArch64: Implement optimized strlen.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 965e4a1..c7487f5 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+        * sysdeps/aarch64/strlen.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
         * sysdeps/aarch64/strcmp.S: New file.
 
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
diff --git a/ports/sysdeps/aarch64/strlen.S b/ports/sysdeps/aarch64/strlen.S
new file mode 100644
index 0000000..ba05009
--- /dev/null
+++ b/ports/sysdeps/aarch64/strlen.S
@@ -0,0 +1,117 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define len		x0
+
+/* Locals and temporaries.  */
+#define src		x1
+#define data1		x2
+#define data2		x3
+#define data2a		x4
+#define has_nul1	x5
+#define has_nul2	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define tmp4		x10
+#define zeroones	x11
+#define pos		x12
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+ENTRY_ALIGN (strlen, 6)	/* In: srcin (x0) = string.  Out: len (x0).  */
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15		/* src = srcin rounded down to 16 bytes.  */
+	ands	tmp1, srcin, #15	/* tmp1 = offset of srcin in that block.  */
+	b.ne	L(misaligned)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* The inner loop deals with two Dwords at a time.  This has a
+	   slightly higher start-up cost, but we should win quite quickly,
+	   especially on cores with a high number of issue slots per
+	   cycle, as we get much better parallelism out of the operations.  */
+L(loop):
+	ldp	data1, data2, [src], #16	/* Load 16 bytes; src += 16.  */
+L(realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2	/* Non-zero iff data1 holds a NUL.  */
+	bics	has_nul2, tmp3, tmp4	/* Same for data2; Z set if none.  */
+	ccmp	has_nul1, #0, #0, eq	/* If data2 is clean test data1, else NZCV = 0000 (ne).  */
+	b.eq	L(loop)			/* Loop while neither Dword has a NUL.  */
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	sub	len, src, srcin		/* Bytes from srcin to the end of data2.  */
+	cbz	has_nul1, L(nul_in_data2)
+#ifdef __AARCH64EB__
+	mov	data2, data1
+#endif
+	sub	len, len, #8		/* NUL is in data1: step back one Dword.  */
+	mov	has_nul2, has_nul1	/* Let the shared tail work on has_nul2.  */
+L(nul_in_data2):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	sub	len, len, #8		/* len = offset of the Dword holding the NUL.  */
+	rev	has_nul2, has_nul2	/* Earliest string byte to the MSB for clz.  */
+	clz	pos, has_nul2		/* Bit index of the first NUL marker.  */
+	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	RET
+
+L(misaligned):
+	cmp	tmp1, #8		/* Flags are consumed by csinv/csel below.  */
+	neg	tmp1, tmp1
+	ldp	data1, data2, [src], #16
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2	/* Offset <= 8: bytes before srcin become 0xff.  */
+	orr	data2a, data2, tmp2	/* Offset > 8: same masking applied to data2.  */
+	csinv	data1, data1, xzr, le	/* Offset > 8: data1 is all padding, use ~0.  */
+	csel	data2, data2, data2a, le	/* Offset > 8: take the masked data2.  */
+	b	L(realigned)
+END (strlen)
+libc_hidden_builtin_def (strlen)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=38fecb39a0c3daecdc44973422894443b378c606

commit 38fecb39a0c3daecdc44973422894443b378c606
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:29:29 2013 +0000

    AArch64: Implement optimized strcmp

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 3f8821c..965e4a1 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+        * sysdeps/aarch64/strcmp.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
         * sysdeps/aarch64/bzero.S: New file.
 
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
diff --git a/ports/sysdeps/aarch64/strcmp.S b/ports/sysdeps/aarch64/strcmp.S
new file mode 100644
index 0000000..fa4705c
--- /dev/null
+++ b/ports/sysdeps/aarch64/strcmp.S
@@ -0,0 +1,155 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include <sysdep.h>
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x2
+#define data1w		w2
+#define data2		x3
+#define data2w		w3
+#define has_nul		x4
+#define diff		x5
+#define syndrome	x6
+#define tmp1		x7
+#define tmp2		x8
+#define tmp3		x9
+#define zeroones	x10
+#define pos		x11
+
+	/* Start of performance-critical section  -- one 64B cache line.  */
+ENTRY_ALIGN(strcmp, 6)	/* In: src1 (x0), src2 (x1).  Out: result (x0).  */
+
+	eor	tmp1, src1, src2
+	mov	zeroones, #REP8_01
+	tst	tmp1, #7		/* Do both pointers share the same offset mod 8?  */
+	b.ne	L(misaligned8)
+	ands	tmp1, src1, #7		/* tmp1 = offset of src1 within its Dword.  */
+	b.ne	L(mutual_align)
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+L(loop_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+L(start_realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	bic	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
+	orr	syndrome, diff, has_nul
+	cbz	syndrome, L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+#ifndef	__AARCH64EB__
+	rev	syndrome, syndrome	/* Earliest string byte to the MSB.  */
+	rev	data1, data1
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, syndrome
+	rev	data2, data2
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the values and then
+	   subtract them (the sub below operates on the X registers).  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	RET
+#else
+	/* For big-endian we cannot use the trick with the syndrome value
+	   as carry-propagation can corrupt the upper bits if the trailing
+	   bytes in the string contain 0x01.  */
+	/* However, if there is no NUL byte in the dword, we can generate
+	   the result directly.  We can't just subtract the bytes as the
+	   MSB might be significant.  */
+	cbnz	has_nul, 1f
+	cmp	data1, data2
+	cset	result, ne		/* result = 1 when the Dwords differ.  */
+	cneg	result, result, lo	/* Negate when data1 < data2 (unsigned).  */
+	RET
+1:
+	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
+	rev	tmp3, data1
+	sub	tmp1, tmp3, zeroones
+	orr	tmp2, tmp3, #REP8_7f
+	bic	has_nul, tmp1, tmp2
+	rev	has_nul, has_nul
+	orr	syndrome, diff, has_nul
+	clz	pos, syndrome
+	/* The MS-non-zero bit of the syndrome marks either the first bit
+	   that is different, or the top bit of the first zero byte.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the values and then
+	   subtract them (the sub below operates on the X registers).  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	RET
+#endif
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2	/* Leading bytes become 0xff in both words, */
+	orr	data2, data2, tmp2	/* so they compare equal and are not NUL.  */
+	b	L(start_realigned)
+
+L(misaligned8):
+	/* We can do better than this.  */
+	ldrb	data1w, [src1], #1	/* Byte-at-a-time loop.  */
+	ldrb	data2w, [src2], #1
+	cmp	data1w, #1		/* Carry set iff the src1 byte is not NUL.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	L(misaligned8)		/* Continue while non-NUL and equal.  */
+	sub	result, data1, data2	/* ldrb zero-extended both, so X sub is exact.  */
+	RET
+END(strcmp)
+libc_hidden_builtin_def (strcmp)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7ac4d962399145b6a913d9d39a38514a7ffc0e9f

commit 7ac4d962399145b6a913d9d39a38514a7ffc0e9f
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:29:05 2013 +0000

    AArch64: Implement optimized bzero.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index baee876..3f8821c 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+        * sysdeps/aarch64/bzero.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
         * sysdeps/aarch64/memmove.S: New file.
 
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
diff --git a/ports/sysdeps/aarch64/bzero.S b/ports/sysdeps/aarch64/bzero.S
new file mode 100644
index 0000000..228c0a5
--- /dev/null
+++ b/ports/sysdeps/aarch64/bzero.S
@@ -0,0 +1,27 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(__bzero)		/* In: x0 = buffer, x1 = length.  */
+	mov	x2, x1		/* Length moves to __memset's count register (x2).  */
+	mov	x1, xzr		/* Fill value (__memset's val, w1) = 0.  */
+	b	__memset	/* Tail call: __memset returns to our caller.  */
+END(__bzero)
+weak_alias (__bzero, bzero)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c2b6221ed3be421c2f7a8ad32e70ad501340b55b

commit c2b6221ed3be421c2f7a8ad32e70ad501340b55b
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:28:39 2013 +0000

    AArch64: Implement optimized memmove.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 51b51eb..baee876 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+        * sysdeps/aarch64/memmove.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
         * sysdeps/aarch64/memcpy.S: New file.
 
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
diff --git a/ports/sysdeps/aarch64/memmove.S b/ports/sysdeps/aarch64/memmove.S
new file mode 100644
index 0000000..c42eb1c
--- /dev/null
+++ b/ports/sysdeps/aarch64/memmove.S
@@ -0,0 +1,312 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ */
+
+/* Parameters and result.  */
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
+ENTRY_ALIGN (memmove, 6)	/* In: dstin (x0), src (x1), count (x2).  x0 is never written.  */
+
+	cmp	dstin, src
+	b.lo	L(downwards)		/* dst below src: move with ascending addresses.  */
+	add	tmp1, src, count	/* tmp1 = one past the end of the source.  */
+	cmp	dstin, tmp1
+	b.hs	memcpy		/* No overlap.  */
+
+	/* Upwards move with potential overlap.
+	 * Need to move from the tail backwards.  SRC and DST point one
+	 * byte beyond the remaining data to move.  */
+	add	dst, dstin, count
+	add	src, src, count
+	cmp	count, #64
+	b.ge	L(mov_not_short_up)
+
+	/* Deal with small moves quickly by dropping straight into the
+	 * exit block.  */
+L(tail63up):
+	/* Move up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30	/* tmp1 = 0, 16, 32 or 48 bytes.  */
+	b.eq	L(tail15up)
+	sub	dst, dst, tmp1
+	sub	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f			/* 32 bytes: two pairs.  */
+	b.lt	2f			/* 16 bytes: one pair.  */
+	ldp	A_l, A_h, [src, #32]	/* 48 bytes: highest pair first.  */
+	stp	A_l, A_h, [dst, #32]
+1:
+	ldp	A_l, A_h, [src, #16]
+	stp	A_l, A_h, [dst, #16]
+2:
+	ldp	A_l, A_h, [src]
+	stp	A_l, A_h, [dst]
+L(tail15up):
+	/* Move up to 15 bytes of data.  Does not assume additional data
+	 * being moved.  */
+	tbz	count, #3, 1f		/* Each set bit of count selects one move.  */
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src, #-1]
+	strb	tmp1w, [dst, #-1]
+1:
+	RET
+
+L(mov_not_short_up):
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Move enough data to reach alignment; unlike memcpy, we have to
+	 * be aware of the overlap, which means we can't move data twice.  */
+	tbz	tmp2, #3, 1f
+	ldr	tmp1, [src, #-8]!
+	str	tmp1, [dst, #-8]!
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src, #-4]!
+	str	tmp1w, [dst, #-4]!
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src, #-2]!
+	strh	tmp1w, [dst, #-2]!
+1:
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src, #-1]!
+	strb	tmp1w, [dst, #-1]!
+1:
+
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	L(tail63up)
+2:
+	subs	count, count, #128	/* Bias count by -128; its low 6 bits are unchanged.  */
+	b.ge	L(mov_body_large_up)
+	/* Less than 128 bytes to move, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src, #-64]!	/* Load all 64 bytes before storing any.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	stp	A_l, A_h, [dst, #-64]!
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	b.ne	L(tail63up)
+	RET
+
+	/* Critical loop.  Start at a new Icache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+L(mov_body_large_up):
+	/* There are at least 128 bytes to move.  */
+	ldp	A_l, A_h, [src, #-16]	/* Prime the pipeline with the top 64 bytes.  */
+	ldp	B_l, B_h, [src, #-32]
+	ldp	C_l, C_h, [src, #-48]
+	ldp	D_l, D_h, [src, #-64]!
+1:
+	stp	A_l, A_h, [dst, #-16]	/* Store a pair, then reload it for the next pass.  */
+	ldp	A_l, A_h, [src, #-16]
+	stp	B_l, B_h, [dst, #-32]
+	ldp	B_l, B_h, [src, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	ldp	C_l, C_h, [src, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	ldp	D_l, D_h, [src, #-64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #-16]	/* Drain the 64 bytes still held in registers.  */
+	stp	B_l, B_h, [dst, #-32]
+	stp	C_l, C_h, [dst, #-48]
+	stp	D_l, D_h, [dst, #-64]!
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	b.ne	L(tail63up)
+	RET
+
+L(downwards):
+	/* For a downwards move we can safely use memcpy provided that
+	 * DST is more than 16 bytes away from SRC.  */
+	sub	tmp1, src, #16
+	cmp	dstin, tmp1
+	b.ls	memcpy		/* May overlap, but not critically.  */
+
+	mov	dst, dstin	/* Preserve DSTIN for return value.  */
+	cmp	count, #64
+	b.ge	L(mov_not_short_down)
+
+	/* Deal with small moves quickly by dropping straight into the
+	 * exit block.  */
+L(tail63down):
+	/* Move up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30	/* tmp1 = 0, 16, 32 or 48 bytes.  */
+	b.eq	L(tail15down)
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f			/* 32 bytes: two pairs.  */
+	b.lt	2f			/* 16 bytes: one pair.  */
+	ldp	A_l, A_h, [src, #-48]	/* 48 bytes: lowest pair first.  */
+	stp	A_l, A_h, [dst, #-48]
+1:
+	ldp	A_l, A_h, [src, #-32]
+	stp	A_l, A_h, [dst, #-32]
+2:
+	ldp	A_l, A_h, [src, #-16]
+	stp	A_l, A_h, [dst, #-16]
+L(tail15down):
+	/* Move up to 15 bytes of data.  Does not assume additional data
+	   being moved.  */
+	tbz	count, #3, 1f		/* Each set bit of count selects one move.  */
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+1:
+	RET
+
+L(mov_not_short_down):
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Move enough data to reach alignment; unlike memcpy, we have to
+	 * be aware of the overlap, which means we can't move data twice.  */
+	tbz	tmp2, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	L(tail63down)
+2:
+	subs	count, count, #128	/* Bias count by -128; its low 6 bits are unchanged.  */
+	b.ge	L(mov_body_large_down)
+	/* Less than 128 bytes to move, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src]		/* Load all 64 bytes before storing any.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	stp	A_l, A_h, [dst]
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	add	src, src, #64		/* add leaves the flags from tst intact.  */
+	add	dst, dst, #64
+	b.ne	L(tail63down)
+	RET
+
+	/* Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+L(mov_body_large_down):
+	/* There are at least 128 bytes to move.  */
+	ldp	A_l, A_h, [src, #0]	/* Prime the pipeline with the first 64 bytes.  */
+	sub	dst, dst, #16		/* Pre-bias.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+1:
+	stp	A_l, A_h, [dst, #16]	/* Store a pair, then reload it for the next pass.  */
+	ldp	A_l, A_h, [src, #16]
+	stp	B_l, B_h, [dst, #32]
+	ldp	B_l, B_h, [src, #32]
+	stp	C_l, C_h, [dst, #48]
+	ldp	C_l, C_h, [src, #48]
+	stp	D_l, D_h, [dst, #64]!
+	ldp	D_l, D_h, [src, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #16]	/* Drain the 64 bytes still held in registers.  */
+	stp	B_l, B_h, [dst, #32]
+	stp	C_l, C_h, [dst, #48]
+	stp	D_l, D_h, [dst, #64]
+	add	src, src, #16		/* Undo the pre-bias on src.  */
+	add	dst, dst, #64 + 16	/* Skip the drained block and undo the pre-bias.  */
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	b.ne	L(tail63down)
+	RET
+END (memmove)
+
+libc_hidden_builtin_def (memmove)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=857c8d222813f26e5178d3de1f272d1c3d9e24ac

commit 857c8d222813f26e5178d3de1f272d1c3d9e24ac
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:28:03 2013 +0000

    AArch64: Implement optimized memcpy.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 9c7b155..51b51eb 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+        * sysdeps/aarch64/memcpy.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
 	* sysdeps/aarch64/memset.S: New file.
 
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
diff --git a/ports/sysdeps/aarch64/memcpy.S b/ports/sysdeps/aarch64/memcpy.S
new file mode 100644
index 0000000..4f4e36c
--- /dev/null
+++ b/ports/sysdeps/aarch64/memcpy.S
@@ -0,0 +1,176 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
+#include <sysdep.h>
+
+ENTRY_ALIGN (memcpy, 6)	/* In: dstin (x0), src (x1), count (x2).  x0 is never written.  */
+
+	mov	dst, dstin		/* Work on a copy of the destination.  */
+	cmp	count, #64
+	b.ge	L(cpy_not_short)
+	cmp	count, #15
+	b.le	L(tail15tiny)
+
+	/* Deal with small copies quickly by dropping straight into the
+	 * exit block.  */
+L(tail63):
+	/* Copy up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30	/* tmp1 = 0, 16, 32 or 48 bytes.  */
+	b.eq	L(tail15)
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f			/* 32 bytes: two pairs.  */
+	b.lt	2f			/* 16 bytes: one pair.  */
+	ldp	A_l, A_h, [src, #-48]
+	stp	A_l, A_h, [dst, #-48]
+1:
+	ldp	A_l, A_h, [src, #-32]
+	stp	A_l, A_h, [dst, #-32]
+2:
+	ldp	A_l, A_h, [src, #-16]
+	stp	A_l, A_h, [dst, #-16]
+
+L(tail15):
+	ands	count, count, #15	/* Remaining 0..15 bytes.  */
+	beq	1f
+	add	src, src, count		/* Point at the end of the data.  */
+	ldp	A_l, A_h, [src, #-16]	/* Copy the final 16 bytes, re-copying some bytes;  */
+	add	dst, dst, count		/* every path here had at least 16 bytes in total.  */
+	stp	A_l, A_h, [dst, #-16]
+1:
+	RET
+
+L(tail15tiny):
+	/* Copy up to 15 bytes of data.  Does not assume additional data
+	   being copied.  */
+	tbz	count, #3, 1f		/* Each set bit of count selects one copy.  */
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+1:
+	RET
+
+L(cpy_not_short):
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Copy more data than needed; it's faster than jumping
+	 * around copying sub-Quadword quantities.  We know that
+	 * it can't overrun.  */
+	ldp	A_l, A_h, [src]		/* Copy 16 bytes but advance by only tmp2.  */
+	add	src, src, tmp2
+	stp	A_l, A_h, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	L(tail63)
+2:
+	subs	count, count, #128	/* Bias count by -128; its low 6 bits are unchanged.  */
+	b.ge	L(cpy_body_large)
+	/* Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+	stp	A_l, A_h, [dst]
+	stp	B_l, B_h, [dst, #16]
+	stp	C_l, C_h, [dst, #32]
+	stp	D_l, D_h, [dst, #48]
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	add	src, src, #64		/* add leaves the flags from tst intact.  */
+	add	dst, dst, #64
+	b.ne	L(tail63)
+	RET
+
+	/* Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+L(cpy_body_large):
+	/* There are at least 128 bytes to copy.  */
+	ldp	A_l, A_h, [src, #0]	/* Prime the pipeline with the first 64 bytes.  */
+	sub	dst, dst, #16		/* Pre-bias.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+1:
+	stp	A_l, A_h, [dst, #16]	/* Store a pair, then reload it for the next pass.  */
+	ldp	A_l, A_h, [src, #16]
+	stp	B_l, B_h, [dst, #32]
+	ldp	B_l, B_h, [src, #32]
+	stp	C_l, C_h, [dst, #48]
+	ldp	C_l, C_h, [src, #48]
+	stp	D_l, D_h, [dst, #64]!
+	ldp	D_l, D_h, [src, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst, #16]	/* Drain the 64 bytes still held in registers.  */
+	stp	B_l, B_h, [dst, #32]
+	stp	C_l, C_h, [dst, #48]
+	stp	D_l, D_h, [dst, #64]
+	add	src, src, #16		/* Undo the pre-bias on src.  */
+	add	dst, dst, #64 + 16	/* Skip the drained block and undo the pre-bias.  */
+	tst	count, #0x3f		/* Any remainder below 64 bytes?  */
+	b.ne	L(tail63)
+	RET
+END (memcpy)
+libc_hidden_builtin_def (memcpy)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=58faa0874bc55f7520ed2255aaa7c5aa42e26292

commit 58faa0874bc55f7520ed2255aaa7c5aa42e26292
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 13:03:43 2013 +0000

    AArch64: Implement optimized memset.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 17a9179..9c7b155 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,9 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+	* sysdeps/aarch64/memset.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
 	* sysdeps/aarch64/sysdep.h (ENTRY_ALIGN): New.
 	* sysdeps/aarch64/memcmp.S: New file.
 
diff --git a/ports/sysdeps/aarch64/memset.S b/ports/sysdeps/aarch64/memset.S
new file mode 100644
index 0000000..f96f6a6
--- /dev/null
+++ b/ports/sysdeps/aarch64/memset.S
@@ -0,0 +1,229 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses are permitted
+ *
+ */
+
+#include <sysdep.h>
+
+/* By default we assume that the DC instruction can be used to zero
+   data blocks more efficiently.  In some circumstances this might be
+   unsafe, for example in an asymmetric multiprocessor environment with
+   different DC clear lengths (neither the upper nor lower lengths are
+   safe to use).  The feature can be disabled by defining DONT_USE_DC.
+
+   If code may be run in a virtualized environment, then define
+   MAYBE_VIRT.  This will cause the code to cache the system register
+   values rather than re-reading them each call.  */
+
+#define dstin		x0	/* Destination as passed in; only read, so it is the return value.  */
+#define val		w1	/* Fill value; only the low 8 bits are used.  */
+#define count		x2	/* Bytes still to be set.  */
+#define tmp1		x3	/* Scratch.  */
+#define tmp1w		w3	/* 32-bit view of tmp1.  */
+#define tmp2		x4	/* Scratch.  */
+#define tmp2w		w4	/* 32-bit view of tmp2; not referenced in this file.  */
+#define zva_len_x	x5	/* DC ZVA block length in bytes.  */
+#define zva_len		w5	/* 32-bit view of zva_len_x.  */
+#define zva_bits_x	x6	/* zva_len_x - 1, used as an alignment mask.  */
+
+#define A_l		x7	/* Fill byte replicated into all eight bytes.  */
+#define A_lw		w7	/* 32-bit view of A_l.  */
+#define dst		x8	/* Working destination pointer.  */
+#define tmp3w		w9	/* Holds the constant 4 when computing zva_len.  */
+
+/* memset: fill COUNT bytes at DSTIN with the low byte of VAL.
+
+   x0 (DSTIN) is only read, so it still holds the original pointer as
+   the return value at every RET; DST is the working pointer and A_l
+   holds the fill byte replicated into all eight bytes.  Sizes are
+   tested with signed conditions (b.ge/b.le/b.lt) throughout.
+
+   Unless DONT_USE_DC is defined, a zero fill with at least 128 bytes
+   left after 16-byte alignment tries DC ZVA to clear whole blocks.  */
+ENTRY_ALIGN (__memset, 6)
+
+	mov	dst, dstin		/* Preserve return value.  */
+	ands	A_lw, val, #255		/* Low byte only; Z is set if it is 0.  */
+#ifndef DONT_USE_DC
+	b.eq	L(zero_mem)		/* Zero fill: consider DC ZVA.  */
+#endif
+	orr	A_lw, A_lw, A_lw, lsl #8	/* Replicate byte to 16 bits.  */
+	orr	A_lw, A_lw, A_lw, lsl #16	/* ... to 32 bits.  */
+	orr	A_l, A_l, A_l, lsl #32		/* ... to 64 bits.  */
+L(tail_maybe_long):
+	cmp	count, #64
+	b.ge	L(not_short)		/* 64 or more: use the 64-byte loop.  */
+L(tail_maybe_tiny):
+	cmp	count, #15
+	b.le	L(tail15tiny)		/* Too small for overlapping 16-byte stores.  */
+L(tail63):
+	/* Only the low six bits of COUNT are used from here on: bits 5:4
+	   select one to three 16-byte stores, bits 3:0 go to tail15.  */
+	ands	tmp1, count, #0x30
+	b.eq	L(tail15)
+	add	dst, dst, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f			/* 32 bytes: two stores.  */
+	b.lt	2f			/* 16 bytes: one store.  */
+	stp	A_l, A_l, [dst, #-48]	/* 48 bytes: three stores.  */
+1:
+	stp	A_l, A_l, [dst, #-32]
+2:
+	stp	A_l, A_l, [dst, #-16]
+
+L(tail15):
+	/* Store the 16 bytes that end exactly at the end of the buffer.
+	   This relies on the bytes just before DST being part of the
+	   buffer (already set, or about to be overwritten identically).  */
+	and	count, count, #15
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+	RET
+
+L(tail15tiny):
+	/* Set up to 15 bytes.  Does not assume earlier memory
+	   being set.  Each bit of COUNT selects one store of that size.  */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	RET
+
+	/* Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line, this ensures the entire loop is in one line.  */
+	.p2align 6
+L(not_short):
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15		/* tmp2 = bytes to the next 16-byte boundary.  */
+	b.eq	2f
+	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
+	 * more than that to set, so we simply store 16 bytes and advance by
+	 * the amount required to reach alignment.  */
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	L(tail63)
+2:
+	sub	dst, dst, #16		/* Pre-bias.  */
+	sub	count, count, #64	/* Pre-bias: loop exits when COUNT goes negative.  */
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!	/* Write-back: dst += 64.  */
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f		/* Low six bits = bytes still to set.  */
+	add	dst, dst, #16		/* Undo the pre-bias.  */
+	b.ne	L(tail63)
+	RET
+
+#ifndef DONT_USE_DC
+	/* For zeroing memory, check to see if we can use the ZVA feature to
+	 * zero entire 'cache' lines.  */
+L(zero_mem):
+	mov	A_l, #0
+	cmp	count, #63
+	b.le	L(tail_maybe_tiny)
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15		/* tmp2 = bytes to the next 16-byte boundary.  */
+	b.eq	1f
+	sub	count, count, tmp2
+	stp	A_l, A_l, [dst]
+	add	dst, dst, tmp2
+	cmp	count, #63
+	b.le	L(tail63)
+1:
+	/* For zeroing small amounts of memory, it's not worth setting up
+	 * the line-clear code.  */
+	cmp	count, #128
+	b.lt	L(not_short)
+#ifdef MAYBE_VIRT
+	/* For efficiency when virtualized, we cache the ZVA capability.
+	   The cached word is 0 until first probed, has bit 31 set when ZVA
+	   is unavailable, and otherwise holds the block length in bytes.  */
+	adrp	tmp2, L(cache_clear)
+	ldr	zva_len, [tmp2, #:lo12:L(cache_clear)]
+	tbnz	zva_len, #31, L(not_short)
+	cbnz	zva_len, L(zero_by_line)
+	mrs	tmp1, dczid_el0
+	tbz	tmp1, #4, 1f
+	/* ZVA not available.  Remember this for next time.  */
+	mov	zva_len, #~0
+	str	zva_len, [tmp2, #:lo12:L(cache_clear)]
+	b	L(not_short)
+1:
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << (DCZID_EL0 & 15) bytes.  */
+	str	zva_len, [tmp2, #:lo12:L(cache_clear)]
+#else
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, L(not_short)	/* Bit 4 set: ZVA not available.  */
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << (DCZID_EL0 & 15) bytes.  */
+#endif
+
+L(zero_by_line):
+	/* Compute how far we need to go to become suitably aligned.  We're
+	 * already at quad-word alignment.  */
+	cmp	count, zva_len_x
+	b.lt	L(not_short)		/* Not enough to reach alignment.  */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes to the next ZVA block boundary.  */
+	b.eq	1f			/* Already aligned.  */
+	/* Not aligned, check that there's enough to copy after alignment.
+	   If tmp1 >= 64 the CCMP compares tmp1 with zva_len_x; otherwise it
+	   forces N=1, V=0 so that the b.lt below is taken.  */
+	sub	tmp1, count, tmp2
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	L(not_short)
+	/* We know that there's at least 64 bytes to zero and that it's safe
+	 * to overrun by 64 bytes.  */
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards.  */
+	add	dst, dst, tmp2		/* tmp2 is negative here.  */
+1:
+	sub	count, count, zva_len_x	/* Pre-bias: loop exits when COUNT goes negative.  */
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x	/* Bytes left after the last whole block.  */
+	b.ne	L(tail_maybe_long)
+	RET
+#ifdef MAYBE_VIRT
+	/* Cached ZVA state, zero-initialised.  Use .pushsection/.popsection
+	   rather than a bare .bss so that the END macro below (cfi_endproc
+	   and the .size expression) is assembled in the same section as the
+	   function itself.  */
+	.pushsection .bss
+	.p2align 2
+L(cache_clear):
+	.space 4
+	.popsection
+#endif
+#endif /* DONT_USE_DC */
+
+END (__memset)
+weak_alias (__memset, memset)
+libc_hidden_builtin_def (memset)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14d941e4dce164097aa6165828d13441ae2a4dbe

commit 14d941e4dce164097aa6165828d13441ae2a4dbe
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 12:58:40 2013 +0000

    AArch64: Implement optimized memcmp.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index 57c93fa..17a9179 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,5 +1,10 @@
 2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
 
+	* sysdeps/aarch64/sysdep.h (ENTRY_ALIGN): New.
+	* sysdeps/aarch64/memcmp.S: New file.
+
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
 	* sysdeps/aarch64/sysdep.h (ENTRY, END): Adjust
 	whitespace.
 
diff --git a/ports/sysdeps/aarch64/memcmp.S b/ports/sysdeps/aarch64/memcmp.S
new file mode 100644
index 0000000..6398ddd
--- /dev/null
+++ b/ports/sysdeps/aarch64/memcmp.S
@@ -0,0 +1,151 @@
+/* memcmp - compare memory
+
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Parameters and result.  */
+#define src1		x0	/* First buffer.  */
+#define src2		x1	/* Second buffer.  */
+#define limit		x2	/* Number of bytes to compare.  */
+#define result		x0	/* Shares x0 with src1.  */
+
+/* Internal variables.  */
+#define data1		x3	/* Data loaded from src1.  */
+#define data1w		w3	/* 32-bit view of data1.  */
+#define data2		x4	/* Data loaded from src2.  */
+#define data2w		w4	/* 32-bit view of data2.  */
+#define has_nul		x5	/* Not referenced in this file.  */
+#define diff		x6	/* data1 ^ data2.  */
+#define endloop		x7	/* Non-zero when the aligned loop must stop.  */
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10	/* Not referenced in this file.  */
+#define pos		x11	/* Leading-zero count of diff.  */
+#define limit_wd	x12	/* Limit in dwords, rounded up.  */
+#define mask		x13	/* Selects the bytes beyond the limit in the last dword.  */
+
+ENTRY_ALIGN (memcmp, 6)
+	cbz	limit, L(ret0)		/* Zero length: return 0.  */
+	eor	tmp1, src1, src2
+	tst	tmp1, #7		/* Same offset within a dword?  */
+	b.ne	L(misaligned8)		/* No: compare byte by byte.  */
+	ands	tmp1, src1, #7
+	b.ne	L(mutual_align)		/* Same offset, but not on a dword boundary.  */
+	add	limit_wd, limit, #7
+	lsr	limit_wd, limit_wd, #3	/* Dwords to compare, rounded up.  */
+	/* Start of performance-critical section  -- one 64B cache line.  */
+L(loop_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+L(start_realigned):
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
+	cbz	endloop, L(loop_aligned)
+	/* End of performance-critical section  -- one 64B cache line.  */
+
+	/* Not reached the limit, must have found a diff.  */
+	cbnz	limit_wd, L(not_limit)
+
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	L(not_limit)
+
+	lsl	limit, limit, #3	/* Bytes -> bits.  */
+	mov	mask, #~0
+#ifdef __AARCH64EB__
+	lsr	mask, mask, limit
+#else
+	lsl	mask, mask, limit
+#endif
+	bic	data1, data1, mask	/* Clear the bytes beyond the limit.  */
+	bic	data2, data2, mask
+
+	orr	diff, diff, mask	/* Mark the end of the significant data.  */
+L(not_limit):
+
+#ifndef	__AARCH64EB__
+	rev	diff, diff
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	/* The MS-non-zero bit of DIFF marks either the first bit
+	   that is different, or the end of the significant data.
+	   Shifting left now will bring the critical information into the
+	   top bits.  */
+	clz	pos, diff
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/* But we need to zero-extend (char is unsigned) the value and then
+	   subtract; the SUB below operates on the 64-bit X registers.  */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	RET
+
+L(mutual_align):
+	/* Sources are mutually aligned, but are not currently at an
+	   alignment boundary.  Round down the addresses and then mask off
+	   the bytes that precede the start point.  */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	ldr	data1, [src1], #8
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	ldr	data2, [src2], #8
+	mov	tmp2, #~0
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	add	limit_wd, limit, #7
+	orr	data1, data1, tmp2	/* Force the preceding bytes to compare equal.  */
+	orr	data2, data2, tmp2
+	lsr	limit_wd, limit_wd, #3
+	b	L(start_realigned)
+
+L(ret0):
+	mov	result, #0
+	RET
+
+	.p2align 6
+L(misaligned8):
+	sub	limit, limit, #1	/* Bias so the SUBS below borrows after the last byte.  */
+1:
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1	/* C is clear once the last byte has been loaded.  */
+	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
+	b.eq	1b			/* Loop while bytes remain and they are equal.  */
+	sub	result, data1, data2	/* Difference of the last pair of bytes loaded.  */
+	RET
+END (memcmp)
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
diff --git a/ports/sysdeps/aarch64/sysdep.h b/ports/sysdeps/aarch64/sysdep.h
index a0fc329..6b75ada 100644
--- a/ports/sysdeps/aarch64/sysdep.h
+++ b/ports/sysdeps/aarch64/sysdep.h
@@ -33,6 +33,15 @@
   cfi_startproc;						\
   CALL_MCOUNT
 
+/* Define an entry point visible from C, aligned to 2^ALIGN bytes.  */
+#define ENTRY_ALIGN(name, align)				\
+  .globl C_SYMBOL_NAME(name);					\
+  .type C_SYMBOL_NAME(name),%function;				\
+  .p2align align;						\
+  C_LABEL(name)							\
+  cfi_startproc;						\
+  CALL_MCOUNT
+
 #undef	END
 #define END(name)						\
   cfi_endproc;							\

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=62216a0a1e79ea5ce17ad99ef9efbb4aa2afb8c0

commit 62216a0a1e79ea5ce17ad99ef9efbb4aa2afb8c0
Author: Marcus Shawcroft <marcus.shawcroft@linaro.org>
Date:   Wed Jan 16 12:53:34 2013 +0000

    AArch64: Tidyup whitespace.

diff --git a/ports/ChangeLog.aarch64 b/ports/ChangeLog.aarch64
index eb9fb97..57c93fa 100644
--- a/ports/ChangeLog.aarch64
+++ b/ports/ChangeLog.aarch64
@@ -1,3 +1,8 @@
+2013-01-17  Marcus Shawcroft  <marcus.shawcroft@linaro.org>
+
+	* sysdeps/aarch64/sysdep.h (ENTRY, END): Adjust
+	whitespace.
+
 2013-01-10  Joseph Myers  <joseph@codesourcery.com>
 
 	* sysdeps/aarch64/bits/setjmp.h (__jmp_buf): Use __extension__
diff --git a/ports/sysdeps/aarch64/sysdep.h b/ports/sysdeps/aarch64/sysdep.h
index d9469b8..a0fc329 100644
--- a/ports/sysdeps/aarch64/sysdep.h
+++ b/ports/sysdeps/aarch64/sysdep.h
@@ -25,24 +25,24 @@
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name
 
 /* Define an entry point visible from C.  */
-#define ENTRY(name)							      \
-  .globl C_SYMBOL_NAME(name);						      \
-  .type C_SYMBOL_NAME(name),%function;					      \
-  .align 4;								      \
-  C_LABEL(name)								      \
-  cfi_startproc;							      \
+#define ENTRY(name)						\
+  .globl C_SYMBOL_NAME(name);					\
+  .type C_SYMBOL_NAME(name),%function;				\
+  .align 4;							\
+  C_LABEL(name)							\
+  cfi_startproc;						\
   CALL_MCOUNT
 
 #undef	END
-#define END(name)							      \
-  cfi_endproc;								      \
+#define END(name)						\
+  cfi_endproc;							\
   ASM_SIZE_DIRECTIVE(name)
 
 /* If compiled for profiling, call `mcount' at the start of each function.  */
 #ifdef	PROF
-# define CALL_MCOUNT			\
-	str	x30, [sp, #-16]!;	\
-	bl	mcount;			\
+# define CALL_MCOUNT						\
+	str	x30, [sp, #-16]!;				\
+	bl	mcount;						\
 	ldr	x30, [sp], #16	;
 #else
 # define CALL_MCOUNT		/* Do nothing.  */

-----------------------------------------------------------------------

Summary of changes:
 ports/ChangeLog.aarch64         |   34 +++++
 ports/sysdeps/aarch64/bzero.S   |   27 ++++
 ports/sysdeps/aarch64/memcmp.S  |  151 +++++++++++++++++++
 ports/sysdeps/aarch64/memcpy.S  |  176 ++++++++++++++++++++++
 ports/sysdeps/aarch64/memmove.S |  312 +++++++++++++++++++++++++++++++++++++++
 ports/sysdeps/aarch64/memset.S  |  229 ++++++++++++++++++++++++++++
 ports/sysdeps/aarch64/strcmp.S  |  155 +++++++++++++++++++
 ports/sysdeps/aarch64/strlen.S  |  117 +++++++++++++++
 ports/sysdeps/aarch64/sysdep.h  |   31 +++--
 9 files changed, 1221 insertions(+), 11 deletions(-)
 create mode 100644 ports/sysdeps/aarch64/bzero.S
 create mode 100644 ports/sysdeps/aarch64/memcmp.S
 create mode 100644 ports/sysdeps/aarch64/memcpy.S
 create mode 100644 ports/sysdeps/aarch64/memmove.S
 create mode 100644 ports/sysdeps/aarch64/memset.S
 create mode 100644 ports/sysdeps/aarch64/strcmp.S
 create mode 100644 ports/sysdeps/aarch64/strlen.S


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]