This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] PowerPC: Set/restore rounding mode only when needed


On Mon, Nov 11, 2013 at 11:37:41AM -0200, Adhemerval Zanella wrote:
> Hi all,
> 
> This is mostly a follow up for patch 2506109403de69bd454de27835d42e6eb6ec3abc which
> added context support for change/restore FPU rounding mode/exceptions. Similar to
> x86_64 I noted the mentioned benchmarks (cos, exp, pow, sin, tan) have a higher 
> percentage of it time being spend in the fexxx family of functions. 
> 
> This patche helps it on performance side by adding the libc_fexxx variant of inline
> functions to handle both FPU round and exception set/restore and by using them on the
> libc_fexxx_ctx functions. I based its implementation on already coded fexxx family functions
> for PPC with fpu.
> 
> He is summary of performance improvements due this patch (measured on a POWER7 machine):
> 
> Before:
> 
> cos(): ITERS:9.5895e+07: TOTAL:5116.03Mcy, MAX:77.6cy, MIN:49.792cy, 18744 calls/Mcy
> exp(): ITERS:2.827e+07: TOTAL:5187.15Mcy, MAX:494.018cy, MIN:38.422cy, 5450.01 calls/Mcy
> pow(): ITERS:6.1705e+07: TOTAL:5144.26Mcy, MAX:171.95cy, MIN:29.935cy, 11994.9 calls/Mcy
> sin(): ITERS:8.6898e+07: TOTAL:5117.06Mcy, MAX:83.841cy, MIN:46.582cy, 16982 calls/Mcy
> tan(): ITERS:2.9473e+07: TOTAL:5115.39Mcy, MAX:191.017cy, MIN:172.352cy, 5761.63 calls/Mcy
> 
> After 2:
> 
> cos(): ITERS:2.05265e+08: TOTAL:5111.37Mcy, MAX:78.754cy, MIN:24.196cy, 40158.5 calls/Mcy
> exp(): ITERS:3.341e+07: TOTAL:5170.84Mcy, MAX:476.317cy, MIN:15.574cy, 6461.23 calls/Mcy
> pow(): ITERS:7.6153e+07: TOTAL:5129.1Mcy, MAX:147.5cy, MIN:30.916cy, 14847.2 calls/Mcy
> sin(): ITERS:1.58816e+08: TOTAL:5115.11Mcy, MAX:1490.39cy, MIN:22.341cy, 31048.4 calls/Mcy
> tan(): ITERS:3.4964e+07: TOTAL:5114.18Mcy, MAX:177.422cy, MIN:146.115cy, 6836.68 calls/Mcy
> 
> Checked on PPC32 and PPC64 with no regressions.
> 
> ---
> 
> 2013-11-11  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
> 
> 	* sysdeps/ieee754/ldbl-128ibm/e_expl.c (__ieee754_expl): Use
> 	SET_RESTORE_ROUND instead of feholdexcept/fesetround/fesetenv.
> 	* sysdeps/powerpc/fpu/fenv_libc.h (__fegetround): Remove define.
> 	(__fesetround): Remove define.
> 	* sysdeps/powerpc/fpu/fenv_private.h: New file: Inline floating point
> 	rounding and exceptions handling.
> 	* sysdeps/powerpc/fpu/math_private.h: Include fenv_private.h.
> 
> --
> 
> index f7c50bf..65ef185 100644
> --- a/sysdeps/ieee754/ldbl-128ibm/e_expl.c
> +++ b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
> @@ -134,18 +134,17 @@ static const long double C[] = {
>  long double
>  __ieee754_expl (long double x)
>  {
> +  long double result, x22;
> +  union ibm_extended_long_double ex2_u, scale_u;
> +  int unsafe;
> +
>    /* Check for usual case.  */
>    if (isless (x, himark) && isgreater (x, lomark))
>      {
> -      int tval1, tval2, unsafe, n_i, exponent2;
> -      long double x22, n, result, xl;
> -      union ibm_extended_long_double ex2_u, scale_u;
> -      fenv_t oldenv;
> -
> -      feholdexcept (&oldenv);
> -#ifdef FE_TONEAREST
> -      fesetround (FE_TONEAREST);
> -#endif
> +      int tval1, tval2, n_i, exponent2;
> +      long double n, xl;
> +
> +      SET_RESTORE_ROUND (FE_TONEAREST);
>  
>        n = __roundl (x*M_1_LN2);
>        x = x-n*M_LN2_0;
> @@ -201,11 +200,6 @@ __ieee754_expl (long double x)
>  	 less than 4.8e-39.  */
>        x22 = x + x*x*(P1+x*(P2+x*(P3+x*(P4+x*(P5+x*P6)))));
>  
> -      /* Return result.  */
> -      fesetenv (&oldenv);
> -
> -      result = x22 * ex2_u.ld + ex2_u.ld;
> -
>        /* Now we can test whether the result is ultimate or if we are unsure.
>  	 In the later case we should probably call a mpn based routine to give
>  	 the ultimate result.
> @@ -235,10 +229,6 @@ __ieee754_expl (long double x)
>  	    return __ieee754_expl_proc2 (origx);
>  	  }
>         */
> -      if (!unsafe)
> -	return result;
> -      else
> -	return result * scale_u.ld;
>      }
>    /* Exceptional cases:  */
>    else if (isless (x, himark))
> @@ -253,5 +243,10 @@ __ieee754_expl (long double x)
>    else
>      /* Return x, if x is a NaN or Inf; or overflow, otherwise.  */
>      return TWO1023*x;
> +
> +  result = x22 * ex2_u.ld + ex2_u.ld;
> +  if (!unsafe)
> +    return result;
> +  return result * scale_u.ld;
>  }
>  strong_alias (__ieee754_expl, __expl_finite)
> diff --git a/sysdeps/powerpc/fpu/fenv_libc.h b/sysdeps/powerpc/fpu/fenv_libc.h
> index baa2a7d..f3ae5d1 100644
> --- a/sysdeps/powerpc/fpu/fenv_libc.h
> +++ b/sysdeps/powerpc/fpu/fenv_libc.h
> @@ -81,7 +81,6 @@ __fegetround (void)
>  		"mfcr  %0" : "=r"(result) : : "cr7");
>    return result & 3;
>  }
> -#define fegetround() __fegetround()
>  
>  static inline int
>  __fesetround (int round)
> @@ -105,7 +104,6 @@ __fesetround (int round)
>  
>    return 0;
>  }
> -#define fesetround(mode) __fesetround(mode)
>  
>  /* Definitions of all the FPSCR bit numbers */
>  enum {
> diff --git a/sysdeps/powerpc/fpu/fenv_private.h b/sysdeps/powerpc/fpu/fenv_private.h
> new file mode 100644
> index 0000000..0752718
> --- /dev/null
> +++ b/sysdeps/powerpc/fpu/fenv_private.h
> @@ -0,0 +1,265 @@
> +/* Private floating point rounding and exceptions handling. PowerPC version.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef FENV_PRIVATE_H
> +#define FENV_PRIVATE_H 1
> +
> +#include <fenv.h>
> +#include <fenv_libc.h>
> +#include <fpu_control.h>
> +
> +#define _FPU_MASK_ALL (_FPU_MASK_ZM | _FPU_MASK_OM | _FPU_MASK_UM \
> +                      | _FPU_MASK_XM | _FPU_MASK_IM)
> +
> +static __always_inline void
> +libc_feholdexcept_ppc (fenv_t *envp)
> +{
> +  fenv_union_t old, new;
> +
> +  old.fenv = *envp = fegetenv_register (); 
> +
> +  /* Clear everything except rounding modes and non-IEEE arithmetic flag.  */
> +  new.l = old.l & 0xffffffff00000007LL;

It would be nice to give this magic number mask a name. For example,
have a ROUNDING_MODES_MASK or something and then just do old.l &
~ROUNDING_MODES_MASK.

> +
> +  /* If the old env had any enabled exceptions, then mask SIGFPE in the
> +     MSR FE0/FE1 bits.  This may allow the FPU to run faster because it
> +     always takes the default action and can not generate SIGFPE. */

Two spaces after period.

> +  if ((old.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_mask_env (); 
> +
> +  fesetenv_register (new.fenv);
> +}
> +
> +static __always_inline void
> +libc_fesetround_ppc (int r)
> +{
> +  fesetround (r);
> +}
> +
> +static __always_inline void
> +libc_feholdexcept_setround_ppc (fenv_t *envp, int r)
> +{
> +  fenv_union_t old, new;
> +
> +  old.fenv = *envp = fegetenv_register (); 
> +
> +  /* Clear everything except rounding modes and non-IEEE arithmetic flag.  */
> +  new.l = (old.l & 0xffffffff00000007LL) | r;

Same here, descriptive name for a magic number.

> +
> +  if ((old.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_mask_env (); 
> +
> +  fesetenv_register (new.fenv);
> +}
> +
> +static __always_inline int
> +libc_fetestexcept_ppc (int e)
> +{
> +  fenv_union_t u;
> +  u.fenv = fegetenv_register (); 
> +  return u.l & e;
> +}
> +
> +static __always_inline void
> +libc_fesetenv_ppc (const fenv_t *envp)
> +{
> +  fenv_union_t old, new;
> +
> +  new.fenv = *envp;
> +  old.fenv = fegetenv_register ();
> +
> +  /* If the old env has no enabled exceptions and the new env has any enabled
> +     exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put the
> +     hardware into "precise mode" and may cause the FPU to run slower on some
> +     hardware.  */
> +  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_nomask_env ();

Missing space after (void).

> +
> +  /* If the old env had any enabled exceptions and the new env has no enabled
> +     exceptions, then mask SIGFPE in the MSR FE0/FE1 bits.  This may allow the
> +     FPU to run faster because it always takes the default action and can not
> +     generate SIGFPE. */
> +  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
> +    (void)__fe_mask_env ();

Likewise.

> +
> +  fesetenv_register (*envp);
> +}
> +
> +static __always_inline int
> +libc_feupdateenv_test_ppc (fenv_t *envp, int ex)
> +{
> +  fenv_union_t old, new;
> +
> +  new.fenv = *envp;
> +  old.fenv = fegetenv_register (); 
> +
> +  /* Restore rounding mode and exception enable from *envp and merge
> +     exceptions.  Leave fraction rounded/inexact and FP result/CC bits
> +     unchanged.  */
> +  new.l = (old.l & 0xffffffff1fffff00LL) | (new.l & 0x1ff80fff);
> +

Descriptive names for magic numbers

> +  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_nomask_env (); 
> +

Missing space.

> +  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
> +    (void)__fe_mask_env (); 
> +

Missing space.

> +  fesetenv_register (new.fenv);
> +
> +  return old.l & ex;
> +}
> +
> +static __always_inline void
> +libc_feupdateenv_ppc (fenv_t *e)
> +{
> +  libc_feupdateenv_test_ppc (e, 0);
> +}
> +
> +static __always_inline void
> +libc_feholdsetround_ppc (fenv_t *e, int r)
> +{
> +  fenv_union_t old, new;
> +
> +  old.fenv = fegetenv_register ();
> +  /* Clear current precision and set newer one/  */

Dot, not slash :)

> +  new.l = (old.l & ~0x3) | r;
> +  *e = old.fenv;
> +
> +  if ((old.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_mask_env (); 

Missing space.

> +  fesetenv_register (new.fenv);
> +}
> +
> +static __always_inline void
> +libc_feresetround_ppc (fenv_t *envp)
> +{
> +  fenv_union_t old, new;
> +
> +  new.fenv = *envp;
> +  old.fenv = fegetenv_register (); 
> +
> +  /* Restore rounding mode and exception enable from *envp and merge
> +     exceptions.  Leave fraction rounded/inexact and FP result/CC bits
> +     unchanged.  */
> +  new.l = (old.l & 0xffffffff1fffff00LL) | (new.l & 0x1ff80fff);
> +

Descriptive name for magic number.

> +  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
> +    (void)__fe_nomask_env (); 
> +
> +  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
> +    (void)__fe_mask_env (); 
> +
> +  /* Atomically enable and raise (if appropriate) exceptions set in `new'. */

Space after period.

> +  fesetenv_register (new.fenv);
> +}
> +
> +#define libc_feholdexceptf           libc_feholdexcept_ppc
> +#define libc_feholdexcept            libc_feholdexcept_ppc
> +#define libc_feholdexcept_setroundf  libc_feholdexcept_setround_ppc
> +#define libc_feholdexcept_setround   libc_feholdexcept_setround_ppc
> +#define libc_fetestexceptf           libc_fetestexcept_ppc
> +#define libc_fetestexcept            libc_fetestexcept_ppc
> +#define libc_fesetroundf             libc_fesetround_ppc
> +#define libc_fesetround              libc_fesetround_ppc
> +#define libc_fesetenvf               libc_fesetenv_ppc
> +#define libc_fesetenv                libc_fesetenv_ppc
> +#define libc_feupdateenv_testf       libc_feupdateenv_test_ppc
> +#define libc_feupdateenv_test        libc_feupdateenv_test_ppc
> +#define libc_feupdateenvf            libc_feupdateenv_ppc
> +#define libc_feupdateenv             libc_feupdateenv_ppc
> +#define libc_feholdsetroundf         libc_feholdsetround_ppc
> +#define libc_feholdsetround          libc_feholdsetround_ppc
> +#define libc_feresetroundf           libc_feresetround_ppc
> +#define libc_feresetround            libc_feresetround_ppc
> +
> +
> +/* We have support for rounding mode context.  */
> +#define HAVE_RM_CTX 1
> +
> +static __always_inline void
> +libc_feholdexcept_setround_ppc_ctx (struct rm_ctx *ctx, int r)
> +{
> +  fenv_union_t old, new;
> +
> +  old.fenv = fegetenv_register (); 
> +
> +  /* Clear everything except rounding modes and non-IEEE arithmetic flag.  */
> +  new.l = (old.l & 0xffffffff00000007LL) | r;

Descriptive name for magic number.

> +  ctx->env = old.fenv;
> +  if (__glibc_unlikely (new.l != old.l))
> +    {
> +      if ((old.l & _FPU_MASK_ALL) != 0)
> +	(void)__fe_mask_env (); 

Missing space.  There's also a stray whitespace at the end.

> +      fesetenv_register (new.fenv);
> +      ctx->updated_status = true;
> +    }
> +  else
> +    ctx->updated_status = false;
> +}
> +
> +static __always_inline void
> +libc_fesetenv_ppc_ctx (struct rm_ctx *ctx)
> +{
> +  libc_fesetenv_ppc (&ctx->env);
> +}
> +
> +static __always_inline void
> +libc_feupdateenv_ppc_ctx (struct rm_ctx *ctx)
> +{
> +  if (__glibc_unlikely (ctx->updated_status))
> +    libc_feupdateenv_test_ppc (&ctx->env, 0);
> +}
> +
> +static __always_inline void
> +libc_feholdsetround_ppc_ctx (struct rm_ctx *ctx, int r)
> +{
> +  fenv_union_t old, new;
> +
> +  old.fenv = fegetenv_register ();
> +  new.l = (old.l & ~0x3) | r;
> +  ctx->env = old.fenv;
> +  if (__glibc_unlikely (new.l != old.l))
> +    {
> +      if ((old.l & _FPU_MASK_ALL) != 0)
> +	(void)__fe_mask_env (); 

Missing space and stray whitespace at the end.

> +      fesetenv_register (new.fenv);
> +      ctx->updated_status = true;
> +    }
> +  else
> +    ctx->updated_status = false;
> +}
> +
> +static __always_inline void
> +libc_feresetround_ppc_ctx (struct rm_ctx *ctx)
> +{
> +  if (__glibc_unlikely (ctx->updated_status))
> +    libc_feresetround_ppc (&ctx->env);
> +}
> +
> +#define libc_feholdexcept_setroundf_ctx  libc_feholdexcept_setround_ppc_ctx
> +#define libc_feholdexcept_setround_ctx   libc_feholdexcept_setround_ppc_ctx
> +#define libc_fesetenv_ctx                libc_fesetenv_ppc_ctx
> +#define libc_fesetenvf_ctx               libc_fesetenv_ppc_ctx
> +#define libc_feholdsetround_ctx          libc_feholdsetround_ppc_ctx
> +#define libc_feholdsetroundf_ctx         libc_feholdsetround_ppc_ctx
> +#define libc_feresetround_ctx            libc_feresetround_ppc_ctx
> +#define libc_feresetroundf_ctx           libc_feresetround_ppc_ctx
> +#define libc_feupdateenvf_ctx            libc_feupdateenv_ppc_ctx
> +#define libc_feupdateenv_ctx             libc_feupdateenv_ppc_ctx
> +
> +#endif
> diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
> index 6c00785..c8833d6 100644
> --- a/sysdeps/powerpc/fpu/math_private.h
> +++ b/sysdeps/powerpc/fpu/math_private.h
> @@ -22,6 +22,7 @@
>  #include <sysdep.h>
>  #include <ldsodefs.h>
>  #include <dl-procinfo.h>
> +#include <fenv_private.h>
>  #include_next <math_private.h>
>  
>  # if __WORDSIZE == 64 || defined _ARCH_PWR4
> 

Siddhesh


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]