This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Use integer constants in mpa.c
- From: Siddhesh Poyarekar <siddhesh at redhat dot com>
- To: libc-alpha at sourceware dot org
- Date: Fri, 22 Mar 2013 15:32:06 +0530
- Subject: [PATCH] Use integer constants in mpa.c
Hi,
As a follow-up to the integral mantissa patch, this patch uses
integral constants throughout mpa.c to prevent the compiler from
unnecessarily generating floating-point conversion and arithmetic code.
Additionally, there were a few places where the compiler's static
branch prediction favoured the quick-exit branch, putting the slower
path at a further disadvantage. I have added branch prediction hints
in these cases to favour the slower path, thus evening out the
performance characteristics.
I verified the results of this change on x86_64, arm and powerpc.
The only change in code to powerpc is due to the change in branch
prediction, which does not have a noticeable impact. On x86_64 and
arm however, there is a noticeable impact.
x86_64 without patch:
exp: ITERS:5e+08: TOTAL:11.6349s, MAX:138.288ns, MIN:10.684ns, 4.29741e+07 iter/s
pow: ITERS:2e+08: TOTAL:12.3583s, MAX:189.945ns, MIN:58.128ns, 1.61835e+07 iter/s
slowexp: ITERS:5e+06: TOTAL:9.13459s, MAX:5052.29ns, MIN:1719.01ns, 547370 iter/s
slowpow: ITERS:100000: TOTAL:8.30797s, MAX:97886.4ns, MIN:80192.2ns, 12036.6 iter/s
x86_64 with patch:
exp: ITERS:5e+08: TOTAL:11.312s, MAX:93.428ns, MIN:20.402ns, 4.4201e+07 iter/s
pow: ITERS:2e+08: TOTAL:12.1221s, MAX:193.53ns, MIN:58.075ns, 1.64988e+07 iter/s
slowexp: ITERS:5e+06: TOTAL:6.55884s, MAX:2519.28ns, MIN:1260.72ns, 762329 iter/s
slowpow: ITERS:100000: TOTAL:7.19873s, MAX:75651.4ns, MIN:71147.3ns, 13891.3 iter/s
arm without patch:
This is without the long mantissa patch as well, since that resulted
in a performance regression on arm. Including it here would only skew
the performance figures.
exp: ITERS:100000: TOTAL:100.091s, MAX:1.0022e+06ns, MIN:1.00042e+06ns, 999.096 iter/s
pow: ITERS:100000: TOTAL:259.985s, MAX:2.6017e+06ns, MIN:2.59909e+06ns, 384.638 iter/s
arm with patch:
exp: ITERS:100000: TOTAL:71.0013s, MAX:711433ns, MIN:709795ns, 1408.42 iter/s
pow: ITERS:100000: TOTAL:183.513s, MAX:1.8366e+06ns, MIN:1.8347e+06ns, 544.92 iter/s
That's approx 13% improvement on x86_64 and 30% on arm. No
regressions due to this on any of the architectures.
Siddhesh
* sysdeps/ieee754/dbl-64/mpa.c (__acr): Use integral
constants. Add branch hint to favour the slower path.
(denorm): Likewise.
(__mp_dbl): Likewise.
(__dbl_mp): Likewise.
(add_magnitudes): Likewise.
(sub_magnitudes): Likewise.
(__add): Likewise.
(__sub): Likewise.
(__mul): Likewise.
(__sqr): Likewise.
(__dvd): Likewise.
diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index f1ca8ce..bad02b0 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -80,14 +80,14 @@ __acr (const mp_no *x, const mp_no *y, int p)
{
long i;
- if (X[0] == ZERO)
+ if (__glibc_unlikely (X[0] == 0))
{
- if (Y[0] == ZERO)
+ if (Y[0] == 0)
i = 0;
else
i = -1;
}
- else if (Y[0] == ZERO)
+ else if (__glibc_unlikely (Y[0] == 0))
i = 1;
else
{
@@ -112,7 +112,7 @@ __cpy (const mp_no *x, mp_no *y, int p)
long i;
EY = EX;
- for (i = 0; i <= p; i++)
+ for (i = 0; __glibc_likely (i <= p); i++)
Y[i] = X[i];
}
#endif
@@ -140,10 +140,10 @@ norm (const mp_no *x, double *y, int p)
}
else
{
- for (a = ONE, z[1] = X[1]; z[1] < TWO23;)
+ for (a = 1, z[1] = X[1]; z[1] < TWO23;)
{
- a *= TWO;
- z[1] *= TWO;
+ a *= 2;
+ z[1] *= 2;
}
for (i = 2; i < 5; i++)
@@ -160,21 +160,21 @@ norm (const mp_no *x, double *y, int p)
if (v == TWO18)
{
- if (z[4] == ZERO)
+ if (z[4] == 0)
{
for (i = 5; i <= p; i++)
{
- if (X[i] == ZERO)
+ if (X[i] == 0)
continue;
else
{
- z[3] += ONE;
+ z[3] += 1;
break;
}
}
}
else
- z[3] += ONE;
+ z[3] += 1;
}
c = (z[1] + R * (z[2] + R * (double) z[3])) / a;
@@ -202,43 +202,43 @@ denorm (const mp_no *x, double *y, int p)
mantissa_t u, z[5];
#define R RADIXI
- if (EX < -44 || (EX == -44 && X[1] < TWO5))
+ if (__glibc_unlikely (EX < -44 || (EX == -44 && X[1] < TWO5)))
{
*y = ZERO;
return;
}
- if (p2 == 1)
+ if (__glibc_unlikely (p2 == 1))
{
if (EX == -42)
{
z[1] = X[1] + TWO10;
- z[2] = ZERO;
- z[3] = ZERO;
+ z[2] = 0;
+ z[3] = 0;
k = 3;
}
else if (EX == -43)
{
z[1] = TWO10;
z[2] = X[1];
- z[3] = ZERO;
+ z[3] = 0;
k = 2;
}
else
{
z[1] = TWO10;
- z[2] = ZERO;
+ z[2] = 0;
z[3] = X[1];
k = 1;
}
}
- else if (p2 == 2)
+ else if (__glibc_unlikely (p2 == 2))
{
if (EX == -42)
{
z[1] = X[1] + TWO10;
z[2] = X[2];
- z[3] = ZERO;
+ z[3] = 0;
k = 3;
}
else if (EX == -43)
@@ -251,7 +251,7 @@ denorm (const mp_no *x, double *y, int p)
else
{
z[1] = TWO10;
- z[2] = ZERO;
+ z[2] = 0;
z[3] = X[1];
k = 1;
}
@@ -273,7 +273,7 @@ denorm (const mp_no *x, double *y, int p)
else
{
z[1] = TWO10;
- z[2] = ZERO;
+ z[2] = 0;
k = 1;
}
z[3] = X[k];
@@ -285,11 +285,11 @@ denorm (const mp_no *x, double *y, int p)
{
for (i = k + 1; i <= p2; i++)
{
- if (X[i] == ZERO)
+ if (X[i] == 0)
continue;
else
{
- z[3] += ONE;
+ z[3] += 1;
break;
}
}
@@ -306,7 +306,7 @@ denorm (const mp_no *x, double *y, int p)
void
__mp_dbl (const mp_no *x, double *y, int p)
{
- if (X[0] == ZERO)
+ if (X[0] == 0)
{
*y = ZERO;
return;
@@ -329,23 +329,23 @@ __dbl_mp (double x, mp_no *y, int p)
long p2 = p;
/* Sign. */
- if (x == ZERO)
+ if (__glibc_unlikely (x == ZERO))
{
- Y[0] = ZERO;
+ Y[0] = 0;
return;
}
else if (x > ZERO)
- Y[0] = ONE;
+ Y[0] = 1;
else
{
- Y[0] = MONE;
+ Y[0] = -1;
x = -x;
}
/* Exponent. */
- for (EY = ONE; x >= RADIX; EY += ONE)
+ for (EY = 1; x >= RADIX; EY += 1)
x *= RADIXI;
- for (; x < ONE; EY -= ONE)
+ for (; x < ONE; EY -= 1)
x *= RADIX;
/* Digits. */
@@ -356,7 +356,7 @@ __dbl_mp (double x, mp_no *y, int p)
x *= RADIX;
}
for (; i <= p2; i++)
- Y[i] = ZERO;
+ Y[i] = 0;
}
/* Add magnitudes of *X and *Y assuming that abs (*X) >= abs (*Y) > 0. The
@@ -383,39 +383,39 @@ add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
return;
}
- zk = ZERO;
+ zk = 0;
- for (; j > 0; i--, j--)
+ for (; __glibc_likely (j > 0); i--, j--)
{
zk += X[i] + Y[j];
if (zk >= RADIX)
{
Z[k--] = zk - RADIX;
- zk = ONE;
+ zk = 1;
}
else
{
Z[k--] = zk;
- zk = ZERO;
+ zk = 0;
}
}
- for (; i > 0; i--)
+ for (; __glibc_likely (i > 0); i--)
{
zk += X[i];
if (zk >= RADIX)
{
Z[k--] = zk - RADIX;
- zk = ONE;
+ zk = 1;
}
else
{
Z[k--] = zk;
- zk = ZERO;
+ zk = 0;
}
}
- if (zk == ZERO)
+ if (zk == 0)
{
for (i = 1; i <= p2; i++)
Z[i] = Z[i + 1];
@@ -423,7 +423,7 @@ add_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
else
{
Z[1] = zk;
- EZ += ONE;
+ EZ += 1;
}
}
@@ -453,27 +453,27 @@ sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
/* The relevant least significant digit in Y is non-zero, so we factor it in
to enhance accuracy. */
- if (j < p2 && Y[j + 1] > ZERO)
+ if (j < p2 && Y[j + 1] > 0)
{
Z[k + 1] = RADIX - Y[j + 1];
- zk = MONE;
+ zk = -1;
}
else
- zk = Z[k + 1] = ZERO;
+ zk = Z[k + 1] = 0;
/* Subtract and borrow. */
for (; j > 0; i--, j--)
{
zk += (X[i] - Y[j]);
- if (zk < ZERO)
+ if (zk < 0)
{
Z[k--] = zk + RADIX;
- zk = MONE;
+ zk = -1;
}
else
{
Z[k--] = zk;
- zk = ZERO;
+ zk = 0;
}
}
@@ -481,25 +481,25 @@ sub_magnitudes (const mp_no *x, const mp_no *y, mp_no *z, int p)
for (; i > 0; i--)
{
zk += X[i];
- if (zk < ZERO)
+ if (zk < 0)
{
Z[k--] = zk + RADIX;
- zk = MONE;
+ zk = -1;
}
else
{
Z[k--] = zk;
- zk = ZERO;
+ zk = 0;
}
}
/* Normalize. */
- for (i = 1; Z[i] == ZERO; i++);
+ for (i = 1; Z[i] == 0; i++);
EZ = EZ - i + 1;
for (k = 1; i <= p2 + 1;)
Z[k++] = Z[i++];
for (; k <= p2;)
- Z[k++] = ZERO;
+ Z[k++] = 0;
}
/* Add *X and *Y and store the result in *Z. X and Y may overlap, but not X
@@ -511,12 +511,12 @@ __add (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
int n;
- if (X[0] == ZERO)
+ if (X[0] == 0)
{
__cpy (y, z, p);
return;
}
- else if (Y[0] == ZERO)
+ else if (Y[0] == 0)
{
__cpy (x, z, p);
return;
@@ -548,7 +548,7 @@ __add (const mp_no *x, const mp_no *y, mp_no *z, int p)
Z[0] = Y[0];
}
else
- Z[0] = ZERO;
+ Z[0] = 0;
}
}
@@ -561,13 +561,13 @@ __sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
int n;
- if (X[0] == ZERO)
+ if (X[0] == 0)
{
__cpy (y, z, p);
Z[0] = -Z[0];
return;
}
- else if (Y[0] == ZERO)
+ else if (Y[0] == 0)
{
__cpy (x, z, p);
return;
@@ -599,7 +599,7 @@ __sub (const mp_no *x, const mp_no *y, mp_no *z, int p)
Z[0] = -Y[0];
}
else
- Z[0] = ZERO;
+ Z[0] = 0;
}
}
@@ -618,23 +618,23 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
mantissa_store_t *diag;
/* Is z=0? */
- if (__glibc_unlikely (X[0] * Y[0] == ZERO))
+ if (__glibc_unlikely (X[0] * Y[0] == 0))
{
- Z[0] = ZERO;
+ Z[0] = 0;
return;
}
/* We need not iterate through all X's and Y's since it's pointless to
multiply zeroes. Here, both are zero... */
for (ip2 = p2; ip2 > 0; ip2--)
- if (X[ip2] != ZERO || Y[ip2] != ZERO)
+ if (X[ip2] != 0 || Y[ip2] != 0)
break;
- a = X[ip2] != ZERO ? y : x;
+ a = X[ip2] != 0 ? y : x;
/* ... and here, at least one of them is still zero. */
for (ip = ip2; ip > 0; ip--)
- if (a->d[ip] != ZERO)
+ if (a->d[ip] != 0)
break;
/* The product looks like this for p = 3 (as an example):
@@ -665,15 +665,15 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
k = (__glibc_unlikely (p2 < 3)) ? p2 + p2 : p2 + 3;
- while (k > ip + ip2 + 1)
- Z[k--] = ZERO;
+ while (__glibc_unlikely (k > ip + ip2 + 1))
+ Z[k--] = 0;
- zk = ZERO;
+ zk = 0;
/* Precompute sums of diagonal elements so that we can directly use them
later. See the next comment to know we why need them. */
diag = alloca (k * sizeof (mantissa_store_t));
- mantissa_store_t d = ZERO;
+ mantissa_store_t d = 0;
for (i = 1; i <= ip; i++)
{
d += X[i] * (mantissa_store_t) Y[i];
@@ -738,7 +738,7 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
int e = EX + EY;
/* Is there a carry beyond the most significant digit? */
- if (__glibc_unlikely (Z[1] == ZERO))
+ if (__glibc_likely (Z[1] == 0))
{
for (i = 1; i <= p2; i++)
Z[i] = Z[i + 1];
@@ -763,24 +763,24 @@ __sqr (const mp_no *x, mp_no *y, int p)
mantissa_store_t yk;
/* Is z=0? */
- if (__glibc_unlikely (X[0] == ZERO))
+ if (__glibc_unlikely (X[0] == 0))
{
- Y[0] = ZERO;
+ Y[0] = 0;
return;
}
/* We need not iterate through all X's since it's pointless to
multiply zeroes. */
for (ip = p; ip > 0; ip--)
- if (X[ip] != ZERO)
+ if (X[ip] != 0)
break;
k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
while (k > 2 * ip + 1)
- Y[k--] = ZERO;
+ Y[k--] = 0;
- yk = ZERO;
+ yk = 0;
while (k > p)
{
@@ -802,7 +802,7 @@ __sqr (const mp_no *x, mp_no *y, int p)
for (i = k - p, j = p; i < j; i++, j--)
yk2 += X[i] * (mantissa_store_t) X[j];
- yk += 2.0 * yk2;
+ yk += 2 * yk2;
DIV_RADIX (yk, Y[k]);
k--;
@@ -820,7 +820,7 @@ __sqr (const mp_no *x, mp_no *y, int p)
for (i = 1, j = k - 1; i < j; i++, j--)
yk2 += X[i] * (mantissa_store_t) X[j];
- yk += 2.0 * yk2;
+ yk += 2 * yk2;
DIV_RADIX (yk, Y[k]);
k--;
@@ -828,7 +828,7 @@ __sqr (const mp_no *x, mp_no *y, int p)
Y[k] = yk;
/* Squares are always positive. */
- Y[0] = 1.0;
+ Y[0] = 1;
/* Get the exponent sum into an intermediate variable. This is a subtle
optimization, where given enough registers, all operations on the exponent
@@ -836,7 +836,7 @@ __sqr (const mp_no *x, mp_no *y, int p)
int e = EX * 2;
/* Is there a carry beyond the most significant digit? */
- if (__glibc_unlikely (Y[1] == ZERO))
+ if (__glibc_unlikely (Y[1] == 0))
{
for (i = 1; i <= p; i++)
Y[i] = Y[i + 1];
@@ -894,8 +894,8 @@ __dvd (const mp_no *x, const mp_no *y, mp_no *z, int p)
{
mp_no w;
- if (X[0] == ZERO)
- Z[0] = ZERO;
+ if (__glibc_unlikely (X[0] == 0))
+ Z[0] = 0;
else
{
__inv (y, &w, p);