This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fix some more dbl-64/s_fma.c issues


On Thu, Oct 14, 2010 at 05:37:29PM +0200, Jakub Jelinek wrote:
> it is smaller than DBL_MIN / 4.0, IEEE 754 needs 2 guard bits and sticky
> bit) - then the oring in of INEXACT exception bit from round to zero
> addition is into a sticky or smaller bit and thus IMHO round to nearest
> should handle it properly.  Strangely attached mpfr random tester actually
> just shows issues when the subnormal is in DBL_MIN / 2.0 to DBL_MIN -
> DBL_DENORM_MIN range, though of course that's not proof.
> When the result is subnormal with highest bit of mantissa set, the
> v.ieee.mantissa1 |= j;
> changes the high guard bit, so it is not surprising that it doesn't
> round correctly in many cases.

Actually, scratch that patch, this one instead seems to work
(and passes the testfma.c test with 100million iterations).

The only problematic case is v.ieee.exponent == 106, i.e. when the subnormal
(with round to zero) would be DBL_MIN / 2.0 to DBL_MIN - __DBL_DENORM_MIN__.
If we need to add inexact bit in, we can't or it into the round bit,
as if new LSB bit is 0, round bit is 1 and our virtual sticky bit 1,
it would still round down instead of up.  All other cases for
v.ieee.exponent == 106 can be handled just by normal round to nearest
v.d * 0x1p-106 (similarly if our virtual sticky bit is 0).
If v.ieee.exponent is smaller than 106, then the subnormal will be
smaller (or equal if rounding up) than DBL_MIN / 2.0, we have a spot
where we can or in the virtual sticky bit and then just multiply.
If v.ieee.exponent is bigger than 106, we can and should perform both the
addition and multiplication in round to nearest, instead of doing addition
in round to zero with remembered inexact (i.e. like round to odd) and
multiplication with round to nearest.

Trying 10billion iterations now.

2010-10-14  Jakub Jelinek  <jakub@redhat.com>

	[BZ #3268]
	* math/libm-test.inc (fma_test): Add some more tests.
	* sysdeps/ieee754/dbl-64/s_fma.c (__fma): Handle underflows
	correctly.

--- libc/math/libm-test.inc.jj	2010-10-14 09:08:58.000000000 +0200
+++ libc/math/libm-test.inc	2010-10-14 20:21:24.000000000 +0200
@@ -2808,6 +2808,16 @@ fma_test (void)
   TEST_fff_f (fma, 0x1.fffffffffffffp+1023, 0x1.001p+0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1011);
   TEST_fff_f (fma, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+0, 0x1.fffffffffffffp+1023, -0x1.ffffffffffffdp+1023);
   TEST_fff_f (fma, 0x1.fffffffffffffp+1023, 2.0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1023);
+  TEST_fff_f (fma, 0x1.6a09e667f3bccp-538, 0x1.6a09e667f3bccp-538, 0.0, 0.0);
+  TEST_fff_f (fma, 0x1.deadbeef2feedp-495, 0x1.deadbeef2feedp-495, -0x1.bf86a5786a574p-989, 0x0.0000042625a1fp-1022);
+  TEST_fff_f (fma, 0x1.deadbeef2feedp-503, 0x1.deadbeef2feedp-503, -0x1.bf86a5786a574p-1005, 0x0.0000000004262p-1022);
+  TEST_fff_f (fma, 0x1p-537, 0x1p-538, 0x1p-1074, 0x0.0000000000002p-1022);
+  TEST_fff_f (fma, 0x1.7fffff8p-968, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000001p-1022);
+  TEST_fff_f (fma, 0x1.4000004p-967, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000003p-1022);
+  TEST_fff_f (fma, 0x1.4p-967, -0x1p-106, -0x0.000001p-1022, -0x0.0000010000002p-1022);
+  TEST_fff_f (fma, -0x1.19cab66d73e17p-959, 0x1.c7108a8c5ff51p-107, -0x0.80b0ad65d9b64p-1022, -0x0.80b0ad65d9d59p-1022);
+  TEST_fff_f (fma, -0x1.d2eaed6e8e9d3p-979, -0x1.4e066c62ac9ddp-63, -0x0.9245e6b003454p-1022, -0x0.9245c09c5fb5dp-1022);
+  TEST_fff_f (fma, 0x1.153d650bb9f06p-907, 0x1.2d01230d48407p-125, -0x0.b278d5acfc3cp-1022, -0x0.b22757123bbe9p-1022);
 #endif
 
   END (fma);
--- libc/sysdeps/ieee754/dbl-64/s_fma.c.jj	2010-10-14 09:08:58.000000000 +0200
+++ libc/sysdeps/ieee754/dbl-64/s_fma.c	2010-10-14 20:26:05.000000000 +0200
@@ -39,15 +39,20 @@ __fma (double x, double y, double z)
 			>= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG, 0)
       || __builtin_expect (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
       || __builtin_expect (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
-      || __builtin_expect (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0))
+      || __builtin_expect (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
+      || __builtin_expect (u.ieee.exponent + v.ieee.exponent
+			   <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG, 0))
     {
-      /* If x or y or z is Inf/NaN or if fma will certainly overflow,
+      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
+	 or if x * y is less than half of DBL_DENORM_MIN,
 	 compute as x * y + z.  */
       if (u.ieee.exponent == 0x7ff
 	  || v.ieee.exponent == 0x7ff
 	  || w.ieee.exponent == 0x7ff
 	  || u.ieee.exponent + v.ieee.exponent
-	     > 0x7ff + IEEE754_DOUBLE_BIAS)
+	     > 0x7ff + IEEE754_DOUBLE_BIAS
+	  || u.ieee.exponent + v.ieee.exponent
+	     < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2)
 	return x * y + z;
       if (u.ieee.exponent + v.ieee.exponent
 	  >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
@@ -87,7 +92,7 @@ __fma (double x, double y, double z)
 	  else
 	    v.d *= 0x1p53;
 	}
-      else
+      else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
 	{
 	  v.ieee.exponent -= DBL_MANT_DIG;
 	  if (u.ieee.exponent)
@@ -95,6 +100,24 @@ __fma (double x, double y, double z)
 	  else
 	    u.d *= 0x1p53;
 	}
+      else /* if (u.ieee.exponent + v.ieee.exponent
+		  <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */
+	{
+	  if (u.ieee.exponent > v.ieee.exponent)
+	    u.ieee.exponent += 2 * DBL_MANT_DIG;
+	  else
+	    v.ieee.exponent += 2 * DBL_MANT_DIG;
+	  if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4)
+	    {
+	      if (w.ieee.exponent)
+		w.ieee.exponent += 2 * DBL_MANT_DIG;
+	      else
+		w.d *= 0x1p106;
+	      adjust = -1;
+	    }
+	  /* Otherwise x * y should just affect inexact
+	     and nothing else.  */
+	}
       x = u.d;
       y = v.d;
       z = w.d;
@@ -123,18 +146,68 @@ __fma (double x, double y, double z)
   fesetround (FE_TOWARDZERO);
   /* Perform m2 + a2 addition with round to odd.  */
   u.d = a2 + m2;
-  if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
-    u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
-  feupdateenv (&env);
-
-  /* Add that to a1.  */
-  a1 = a1 + u.d;
 
-  /* And adjust exponent if needed.  */
-  if (__builtin_expect (adjust, 0))
-    a1 *= 0x1p53;
-
-  return a1;
+  if (__builtin_expect (adjust == 0, 1))
+    {
+      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
+	u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
+      feupdateenv (&env);
+      /* Result is a1 + u.d.  */
+      return a1 + u.d;
+    }
+  else if (__builtin_expect (adjust > 0, 1))
+    {
+      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
+	u.ieee.mantissa1 |= fetestexcept (FE_INEXACT) != 0;
+      feupdateenv (&env);
+      /* Result is a1 + u.d, scaled up.  */
+      return (a1 + u.d) * 0x1p53;
+    }
+  else
+    {
+      v.d = a1 + u.d;
+      int j = fetestexcept (FE_INEXACT) != 0;
+      feupdateenv (&env);
+      /* Ensure the following computations are performed in default rounding
+	 mode instead of just reusing the round to zero computation.  */
+      asm volatile ("" : "=m" (u) : "m" (u));
+      /* If a1 + u.d is exact, the only rounding happens during
+	 scaling down.  */
+      if (j == 0)
+	return v.d * 0x1p-106;
+      /* If result rounded to zero is not subnormal, no double
+	 rounding will occur.  */
+      if (v.ieee.exponent > 106)
+	return (a1 + u.d) * 0x1p-106;
+      /* If v.d * 0x1p-106 with round to zero is a subnormal above
+	 or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
+	 down just by 1 bit, which means v.ieee.mantissa1 |= j would
+	 change the round bit, not sticky or guard bit.
+	 v.d * 0x1p-106 never normalizes by shifting up,
+	 so round bit plus sticky bit should be already enough
+	 for proper rounding.  */
+      if (v.ieee.exponent == 106)
+	{
+	  /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
+	     v.ieee.mantissa1 & 1 is the round bit and j is our sticky
+	     bit.  In round-to-nearest 001 rounds down like 00,
+	     011 rounds up, even though 01 rounds down (thus we need
+	     to adjust), 101 rounds down like 10 and 111 rounds up
+	     like 11.  */
+	  if ((v.ieee.mantissa1 & 3) == 1)
+	    {
+	      v.d *= 0x1p-106;
+	      if (v.ieee.negative)
+		return v.d - 0x1p-1074 /* __DBL_DENORM_MIN__ */;
+	      else
+		return v.d + 0x1p-1074 /* __DBL_DENORM_MIN__ */;
+	    }
+	  else
+	    return v.d * 0x1p-106;
+	}
+      v.ieee.mantissa1 |= j;
+      return v.d * 0x1p-106;
+    }
 }
 #ifndef __fma
 weak_alias (__fma, fma)

	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]