This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH v3] Use long for mantissa for generic mp code

From: Siddhesh Poyarekar <siddhesh at redhat dot com>
To: Richard Henderson <rth at twiddle dot net>
Cc: libc-alpha <libc-alpha at sourceware dot org>
Date: Fri, 22 Mar 2013 14:56:54 +0530
Subject: [PATCH v3] Use long for mantissa for generic mp code
References: <512D7D56 dot 2030209 at twiddle dot net> <20130308123807 dot GA21984 at spoyarek dot pnq dot redhat dot com> <5140AD70 dot 7090808 at twiddle dot net> <20130314082826 dot GT22471 at spoyarek dot pnq dot redhat dot com> <51432D0E dot 60407 at twiddle dot net> <CAAHN_R13LUMFd+YYrqj8_FRVdbsHG65ettLfQV81Uy_06ASmOQ at mail dot gmail dot com> <CAAHN_R15C_K4vabAWH863xNdZ6dRxH_rxnV+mFxWADkxmmgFRQ at mail dot gmail dot com> <514339B4 dot 1060307 at twiddle dot net> <51434720 dot 2070601 at twiddle dot net> <51434EAF dot 4090706 at twiddle dot net>

Hi,

Here's an updated version of the patch, tested on i686, x86_64 and
armv6.  There were no code regressions resulting from this patch in
any of the platforms.  There is however a performance regression in
arm where the benchmark for exp runs 1.7 times slower (completed in
~170s as opposed to ~100s without the patch).  This is because there
were a number of double constants that I had left in place, which
resulted in a lot of unnecessary fp operations (including conversion
to/from fp).

I already have a patch to fix this performance regression, which I'll
send shortly as a follow-up on this one.  Upon application of that
patch, the powerpc code is unaffected, but the arm code and x86_64
code are faster.  The arm code is fast enough to resolve the
performance regression and improve further by 30%.  I'll give details
of the same in the email where I actually post the patch.

How does this look now?

Siddhesh

	* sysdeps/ieee754/dbl-64/mpa-arch.h: New file.
	* sysdeps/ieee754/dbl-64/mpa.c (norm): Use MANTISSA_T and
	MANTISSA_STORE_T to store computations on mantissa.  Use
	macros for rounding and division.
	(denorm): Likewise.
	(__dbl_mp): Likewise.
	(add_magnitudes): Likewise.
	(sub_magnitudes): Likewise.
	(__mul): Likewise.
	(__sqr): Likewise.
	* sysdeps/ieee754/dbl-64/mpa.h: Include mpa-arch.h.  Define
	powers of two in terms of TWOPOW macro.
	(mp_no): Make type of mantissa as MANTISSA_T.
	[!RADIXI]: Define RADIXI.
	[!TWO52]: Define TWO52.
	* sysdeps/powerpc/power4/fpu/mpa-arch.h: New file.

diff --git a/sysdeps/ieee754/dbl-64/mpa-arch.h b/sysdeps/ieee754/dbl-64/mpa-arch.h
new file mode 100644
index 0000000..7de9d51
--- /dev/null
+++ b/sysdeps/ieee754/dbl-64/mpa-arch.h
@@ -0,0 +1,47 @@
+/* Overridable constants and operations.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+typedef long mantissa_t;
+typedef int64_t mantissa_store_t;
+
+#define TWOPOW(i) (1L << i)
+
+#define RADIX_EXP 24
+#define  RADIX TWOPOW (RADIX_EXP)		/* 2^24    */
+
+/* Divide D by RADIX and put the remainder in R.  D must be a non-negative
+   integral value.  */
+#define DIV_RADIX(d, r) \
+  ({									      \
+    r = d & (RADIX - 1);						      \
+    d >>= RADIX_EXP;							      \
+  })
+
+/* Put the integer component of a double X in R and retain the fraction in
+   X.  This is used in extracting mantissa digits for MP_NO by using the
+   integer portion of the current value of the number as the current mantissa
+   digit and then scaling by RADIX to get the next mantissa digit in the same
+   manner.  */
+#define INTEGER_OF(x, i) \
+  ({									      \
+    i = (mantissa_t) x;							      \
+    x -= i;								      \
+  })
+
+/* Align IN down to F.  The code assumes that F is a power of two.  */
+#define ALIGN_DOWN_TO(in, f) ((in) & -(f))
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index 0766476..c238ccf 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -125,7 +125,8 @@ norm (const mp_no *x, double *y, int p)
 {
 #define R RADIXI
   long i;
-  double a, c, u, v, z[5];
+  double c;
+  mantissa_t a, u, v, z[5];
   if (p < 5)
     {
       if (p == 1)
@@ -147,17 +148,14 @@ norm (const mp_no *x, double *y, int p)
 
       for (i = 2; i < 5; i++)
 	{
-	  z[i] = X[i] * a;
-	  u = (z[i] + CUTTER) - CUTTER;
-	  if (u > z[i])
-	    u -= RADIX;
-	  z[i] -= u;
-	  z[i - 1] += u * RADIXI;
+	  mantissa_store_t d, r;
+	  d = X[i] * (mantissa_store_t) a;
+	  DIV_RADIX (d, r);
+	  z[i] = r;
+	  z[i - 1] += d;
 	}
 
-      u = (z[3] + TWO71) - TWO71;
-      if (u > z[3])
-	u -= TWO19;
+      u = ALIGN_DOWN_TO (z[3], TWO19);
       v = z[3] - u;
 
       if (v == TWO18)
@@ -200,7 +198,8 @@ denorm (const mp_no *x, double *y, int p)
 {
   long i, k;
   long p2 = p;
-  double c, u, z[5];
+  double c;
+  mantissa_t u, z[5];
 
 #define R RADIXI
   if (EX < -44 || (EX == -44 && X[1] < TWO5))
@@ -280,9 +279,7 @@ denorm (const mp_no *x, double *y, int p)
       z[3] = X[k];
     }
 
-  u = (z[3] + TWO57) - TWO57;
-  if (u > z[3])
-    u -= TWO5;
+  u = ALIGN_DOWN_TO (z[3], TWO5);
 
   if (u == z[3])
     {
@@ -330,7 +327,6 @@ __dbl_mp (double x, mp_no *y, int p)
 {
   long i, n;
   long p2 = p;
-  double u;
 
   /* Sign.  */
   if (x == ZERO)
@@ -356,11 +352,7 @@ __dbl_mp (double x, mp_no *y, int p)
   n = MIN (p2, 4);
   for (i = 1; i <= n; i++)
     {
-      u = (x + TWO52) - TWO52;
-      if (u > x)
-	u -= ONE;
-      Y[i] = u;
-      x -= u;
+      INTEGER_OF (x, Y[i]);
       x *= RADIX;
     }
   for (; i <= p2; i++)
@@ -377,7 +369,7 @@ add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 {
   long i, j, k;
   long p2 = p;
-  double zk;
+  mantissa_t zk;
 
   EZ = EX;
 
@@ -445,7 +437,7 @@ sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
 {
   long i, j, k;
   long p2 = p;
-  double zk;
+  mantissa_t zk;
 
   EZ = EX;
   i = p2;
@@ -621,9 +613,9 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
 {
   long i, j, k, ip, ip2;
   long p2 = p;
-  double u, zk;
+  mantissa_store_t zk;
   const mp_no *a;
-  double *diag;
+  mantissa_store_t *diag;
 
   /* Is z=0?  */
   if (__glibc_unlikely (X[0] * Y[0] == ZERO))
@@ -680,11 +672,11 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
 
   /* Precompute sums of diagonal elements so that we can directly use them
      later.  See the next comment to know we why need them.  */
-  diag = alloca (k * sizeof (double));
-  double d = ZERO;
+  diag = alloca (k * sizeof (mantissa_store_t));
+  mantissa_store_t d = ZERO;
   for (i = 1; i <= ip; i++)
     {
-      d += X[i] * Y[i];
+      d += X[i] * (mantissa_store_t) Y[i];
       diag[i] = d;
     }
   while (i < k)
@@ -697,18 +689,15 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
       if (k % 2 == 0)
 	/* We want to add this only once, but since we subtract it in the sum
 	   of products above, we add twice.  */
-	zk += 2 * X[lim] * Y[lim];
+	zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 
       for (i = k - p2, j = p2; i < j; i++, j--)
-	zk += (X[i] + X[j]) * (Y[i] + Y[j]);
+	zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 
       zk -= diag[k - 1];
 
-      u = (zk + CUTTER) - CUTTER;
-      if (u > zk)
-	u -= RADIX;
-      Z[k--] = zk - u;
-      zk = u * RADIXI;
+      DIV_RADIX (zk, Z[k]);
+      k--;
     }
 
   /* The real deal.  Mantissa digit Z[k] is the sum of all X[i] * Y[j] where i
@@ -731,18 +720,15 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
       if (k % 2 == 0)
 	/* We want to add this only once, but since we subtract it in the sum
 	   of products above, we add twice.  */
-        zk += 2 * X[lim] * Y[lim];
+        zk += 2 * X[lim] * (mantissa_store_t) Y[lim];
 
       for (i = 1, j = k - 1; i < j; i++, j--)
-	zk += (X[i] + X[j]) * (Y[i] + Y[j]);
+	zk += (X[i] + X[j]) * (mantissa_store_t) (Y[i] + Y[j]);
 
       zk -= diag[k - 1];
 
-      u = (zk + CUTTER) - CUTTER;
-      if (u > zk)
-	u -= RADIX;
-      Z[k--] = zk - u;
-      zk = u * RADIXI;
+      DIV_RADIX (zk, Z[k]);
+      k--;
     }
   Z[k] = zk;
 
@@ -774,7 +760,7 @@ SECTION
 __sqr (const mp_no *x, mp_no *y, int p)
 {
   long i, j, k, ip;
-  double u, yk;
+  mantissa_store_t yk;
 
   /* Is z=0?  */
   if (__glibc_unlikely (X[0] == ZERO))
@@ -798,11 +784,11 @@ __sqr (const mp_no *x, mp_no *y, int p)
 
   while (k > p)
     {
-      double yk2 = 0.0;
+      mantissa_store_t yk2 = 0;
       long lim = k / 2;
 
       if (k % 2 == 0)
-	yk += X[lim] * X[lim];
+	yk += X[lim] * (mantissa_store_t) X[lim];
 
       /* In __mul, this loop (and the one within the next while loop) run
          between a range to calculate the mantissa as follows:
@@ -814,36 +800,30 @@ __sqr (const mp_no *x, mp_no *y, int p)
 	 result.  For cases where the range size is even, the mid-point needs
 	 to be added separately (above).  */
       for (i = k - p, j = p; i < j; i++, j--)
-	yk2 += X[i] * X[j];
+	yk2 += X[i] * (mantissa_store_t) X[j];
 
       yk += 2.0 * yk2;
 
-      u = (yk + CUTTER) - CUTTER;
-      if (u > yk)
-	u -= RADIX;
-      Y[k--] = yk - u;
-      yk = u * RADIXI;
+      DIV_RADIX (yk, Y[k]);
+      k--;
     }
 
   while (k > 1)
     {
-      double yk2 = 0.0;
+      mantissa_store_t yk2 = 0;
       long lim = k / 2;
 
       if (k % 2 == 0)
-	yk += X[lim] * X[lim];
+	yk += X[lim] * (mantissa_store_t) X[lim];
 
       /* Likewise for this loop.  */
       for (i = 1, j = k - 1; i < j; i++, j--)
-	yk2 += X[i] * X[j];
+	yk2 += X[i] * (mantissa_store_t) X[j];
 
       yk += 2.0 * yk2;
 
-      u = (yk + CUTTER) - CUTTER;
-      if (u > yk)
-	u -= RADIX;
-      Y[k--] = yk - u;
-      yk = u * RADIXI;
+      DIV_RADIX (yk, Y[k]);
+      k--;
     }
   Y[k] = yk;
 
diff --git a/sysdeps/ieee754/dbl-64/mpa.h b/sysdeps/ieee754/dbl-64/mpa.h
index 168b334..54044a0 100644
--- a/sysdeps/ieee754/dbl-64/mpa.h
+++ b/sysdeps/ieee754/dbl-64/mpa.h
@@ -35,6 +35,7 @@
 /* Common types and definition                                          */
 /************************************************************************/
 
+#include <mpa-arch.h>
 
 /* The mp_no structure holds the details of a multi-precision floating point
    number.
@@ -61,7 +62,7 @@
 typedef struct
 {
   int e;
-  double d[40];
+  mantissa_t d[40];
 } mp_no;
 
 typedef union
@@ -82,9 +83,13 @@ extern const mp_no mptwo;
 
 #define ABS(x)   ((x) <  0  ? -(x) : (x))
 
-#define  RADIX     0x1.0p24		/* 2^24    */
-#define  RADIXI    0x1.0p-24		/* 2^-24   */
-#define  CUTTER    0x1.0p76		/* 2^76    */
+#ifndef RADIXI
+# define  RADIXI    0x1.0p-24		/* 2^-24   */
+#endif
+
+#ifndef TWO52
+# define  TWO52     0x1.0p52		/* 2^52    */
+#endif
 
 #define  ZERO      0.0			/* 0       */
 #define  MZERO     -0.0			/* 0 with the sign bit set */
@@ -92,13 +97,13 @@ extern const mp_no mptwo;
 #define  MONE      -1.0			/* -1      */
 #define  TWO       2.0			/*  2      */
 
-#define  TWO5      0x1.0p5		/* 2^5     */
-#define  TWO8      0x1.0p8		/* 2^52    */
-#define  TWO10     0x1.0p10		/* 2^10    */
-#define  TWO18     0x1.0p18		/* 2^18    */
-#define  TWO19     0x1.0p19		/* 2^19    */
-#define  TWO23     0x1.0p23		/* 2^23    */
-#define  TWO52     0x1.0p52		/* 2^52    */
+#define  TWO5      TWOPOW (5)		/* 2^5     */
+#define  TWO8      TWOPOW (8)		/* 2^52    */
+#define  TWO10     TWOPOW (10)		/* 2^10    */
+#define  TWO18     TWOPOW (18)		/* 2^18    */
+#define  TWO19     TWOPOW (19)		/* 2^19    */
+#define  TWO23     TWOPOW (23)		/* 2^23    */
+
 #define  TWO57     0x1.0p57		/* 2^57    */
 #define  TWO71     0x1.0p71		/* 2^71    */
 #define  TWOM1032  0x1.0p-1032		/* 2^-1032 */
diff --git a/sysdeps/powerpc/power4/fpu/mpa-arch.h b/sysdeps/powerpc/power4/fpu/mpa-arch.h
new file mode 100644
index 0000000..9007c9d
--- /dev/null
+++ b/sysdeps/powerpc/power4/fpu/mpa-arch.h
@@ -0,0 +1,56 @@
+/* Overridable constants and operations.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
+
+typedef double mantissa_t;
+typedef double mantissa_store_t;
+
+#define TWOPOW(i) (0x1.0p##i)
+
+#define RADIX TWOPOW (24)		/* 2^24    */
+#define CUTTER TWOPOW (76)		/* 2^76    */
+#define RADIXI 0x1.0p-24		/* 2^-24 */
+#define TWO52 TWOPOW (52)		/* 2^52 */
+
+/* Divide D by RADIX and put the remainder in R.  */
+#define DIV_RADIX(d,r) \
+  ({									      \
+    double u = ((d) + CUTTER) - CUTTER;					      \
+    if (u > (d))							      \
+      u -= RADIX;							      \
+    r = (d) - u;							      \
+    (d) = u * RADIXI;							      \
+  })
+
+/* Put the integer component of a double X in R and retain the fraction in
+   X.  */
+#define INTEGER_OF(x, r) \
+  ({									      \
+    double u = ((x) + TWO52) - TWO52;					      \
+    if (u > (x))							      \
+      u -= ONE;								      \
+    (r) = u;								      \
+    (x) -= u;								      \
+  })
+
+/* Align IN down to a multiple of F, where F is a power of two.  */
+#define ALIGN_DOWN_TO(in, f) \
+  ({									      \
+    double factor = f * TWO52;						      \
+    double u = (in + factor) - factor;					      \
+    if (u > in)								      \
+      u -= f;								      \
+    u;									      \
+  })

Follow-Ups:
- Re: [PATCH v3] Use long for mantissa for generic mp code
  - From: Richard Henderson

References:
- [PATCH v2] Use long for mantissa for generic mp code
  - From: Siddhesh Poyarekar
- Re: [PATCH v2] Use long for mantissa for generic mp code
  - From: Richard Henderson
- [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Siddhesh Poyarekar
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Richard Henderson
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Siddhesh Poyarekar
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Siddhesh Poyarekar
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Richard Henderson
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Richard Henderson
- Re: [PATCH v2.1] Use long for mantissa for generic mp code
  - From: Richard Henderson

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]