Floating-point composition and decomposition

Names specified here
Name	Description	Notes			Source	Availability
`copysign()`	Extract and apply sign		M	(·)	`<tgmath.h>`				C99	C11
`copysign()`	Extract and apply sign			(·)	`<math.h>`				C99	C11
`copysignf()`	Extract and apply sign			(·)	`<math.h>`				C99	C11
`copysignl()`	Extract and apply sign			(·)	`<math.h>`				C99	C11
`FP_ILOGB0`	Result of logarithm of zero		M		`<math.h>`				C99	C11
`FP_ILOGBNAN`	Result of logarithm of NaN		M		`<math.h>`				C99	C11
`frexp()`	Decompose floating-point number			(·)	`<math.h>`	C89	C90	C95	C99	C11
`frexp()`	Decompose floating-point number		M	(·)	`<tgmath.h>`				C99	C11
`frexpf()`	Decompose floating-point number			(·)	`<math.h>`				C99	C11
`frexpl()`	Decompose floating-point number			(·)	`<math.h>`				C99	C11
`ilogb()`	Compute native-base logarithm			(·)	`<math.h>`				C99	C11
`ilogb()`	Compute native-base logarithm		M	(·)	`<tgmath.h>`				C99	C11
`ilogbf()`	Compute native-base logarithm			(·)	`<math.h>`				C99	C11
`ilogbl()`	Compute native-base logarithm			(·)	`<math.h>`				C99	C11
`ldexp()`	Multiply by power of 2		M	(·)	`<tgmath.h>`				C99	C11
`ldexp()`	Multiply by power of 2			(·)	`<math.h>`	C89	C90	C95	C99	C11
`ldexpf()`	Multiply by power of 2			(·)	`<math.h>`				C99	C11
`ldexpl()`	Multiply by power of 2			(·)	`<math.h>`				C99	C11
`modf()`	Extract integral and fractional parts	L		(·)	Predefined				C99	C11
`modf()`	Extract integral and fractional parts			(·)	`<math.h>`	C89	C90	C95	C99	C11
`modff()`	Extract integral and fractional parts			(·)	`<math.h>`				C99	C11
`modfl()`	Extract integral and fractional parts			(·)	`<math.h>`				C99	C11
`signbit()`	Test for negative		M	(·)	`<math.h>`				C99	C11

<math.h> provides some functions for decomposing floating-point numbers into parts, and for building them up from parts. Some functions can be used to decompose into integers (which are then trivial to serialize for transmission), and then recombine them later.

#include <math.h>
int ilogbf(float x);
int ilogb(double x);
int ilogbl(long double x);

#include <tgmath.h>
int ilogb(real-floating-type x);

The ilogb functions extract the exponent of x as an integer, and return:

`FP_ILOGB0`	when `x` is zero
`INT_MAX`	when `x` is infinite
`FP_ILOGBNAN`	when `x` is NaN
the exponent	otherwise

#include <math.h>
float frexpf(float v, int *xp);
double frexp(double v, int *xp);
long double frexpl(long double v, int *xp);

#include <tgmath.h>
real-floating-type frexp(real-floating-type v, int *xp);

The frexp functions split the input v into two parts: an exponent x written to *exp; and a mantissa m which is returned, and has the range [½, 1]. The two outputs can be used to reform the input, such that v=m.2^x. If v is zero, then *xp and the return value will also be zero.

#include <math.h>
float ldexpf(float m, int x);
double ldexp(double m, int x);
long double ldexpl(long double m, int x);

#include <tgmath.h>
real-floating-type ldexp(real-floating-type m, int x);

The ldexp functions compute and return m.2^x.

#include <math.h>
float modff(float val, float *ip);
double modf(double val, double *ip);
long double modfl(long double val, long double *ip);

#include <tgmath.h>
real-floating-type modf(real-floating-type val, real-floating-type *ip);

The modf functions split the number val into an integer part, stored in *ip, and a fractional part, which is returned. Both parts have the same sign as val.

#include <math.h>
float copysignf(float x, float y);
double copysign(double x, double y);
long double copysignl(long double x, long double y);

#include <tgmath.h>
real-floating-type copysign(real-floating-type x, real-floating-type y);

The copysign functions return a value which has the magnitude of x and the sign of y. If x is NaN, the result is also NaN, with the sign of y.

You can use frexp and ldexp to split a floating-point value into a mantissa and exponent for interoperable representation as integers. Here are two functions ieee754_bin64enc and ieee754_bin64dec to convert between double and uint_fast64_t holding an IEEE754 binary64 value:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>

/* IEEE754 binary64 has 52 mantissa bits, 1 sign bit, and 11 bits for
   the exponent. */

#define MANTISSA_BIT 52
#define EXPONENT_BIT 11
#define SIGN_BIT 1

#define MANTISSA_SHIFT 0
#define MANTISSA_MASK ((UINTMAX_C(1) << MANTISSA_BIT) - 1u)
#define MANTISSA_FIELD (MANTISSA_MASK << MANTISSA_SHIFT)

#define EXPONENT_SHIFT (MANTISSA_SHIFT + MANTISSA_BIT)
#define EXPONENT_MASK ((UINTMAX_C(1) << EXPONENT_BIT) - 1u)
#define EXPONENT_FIELD (EXPONENT_MASK << EXPONENT_SHIFT)

#define SIGN_SHIFT (EXPONENT_SHIFT + EXPONENT_BIT)
#define SIGN_MASK ((UINTMAX_C(1) << SIGN_BIT) - 1u)
#define SIGN_FIELD (SIGN_MASK << SIGN_SHIFT)

uint_fast64_t ieee754_bin64enc(double input)
{
  double d_mant;
  unsigned u_exp;
  unsigned long long u_mant;
  bool neg;
  int s_exp;
  uint_fast64_t bytes;

  switch (fpclassify(input)) {
  case FP_ZERO:
    neg = signbit(input);
    u_mant = 0;
    u_exp = 0;
    break;

  case FP_INFINITE:
    neg = signbit(input);
    u_mant = 0;
    u_exp = 0x7ffu;
    break;

  case FP_NAN:
    neg = false;
    u_mant = 1;
    u_exp = 0x7ffu;
    break;

  case FP_SUBNORMAL:
  case FP_NORMAL:
    /* Handle normal and subnormal together.  The number might be one
       class for double, but another for binary64. */

    /* Decompose the input into a significand (mantissa + 1) and an
       exponent. */
    d_mant = frexp(input, &s_exp);

    /* Extract the sign bit from the mantissa. */
    neg = signbit(input);
    d_mant = fabs(d_mant);

    /* Offset the exponent so it can be represented as an unsigned
       value. */
    s_exp += 1022;

    /* Now we find out whether the number we represent is normal,
       subnormal, or overflows binary64. */
    if (s_exp >= 0x7ff) {
      /* The number is too big for binary64, so use the maximum
         value. */
      u_mant = MANTISSA_MASK;
      u_exp = 0x7feu;
    } else if (s_exp <= 0) {
      /* The number is subnormal in binary64. */

      /* Shift the mantissa so that its exponent would be 0. */
      u_mant = ldexp(d_mant, MANTISSA_BIT);

      u_mant >>= -s_exp;
      u_exp = 0;
    } else {
      /* The number is normal in binary64. */

      /* Use the suggested exponent. */
      u_exp = s_exp;

      /* Make the mantissa value into a positive integer. */
      u_mant = ldexp(d_mant, MANTISSA_BIT + 1);
    }

    break;
  }

  /* Transmit the bottom MANTISSA_BITs of u_mant.  The extra top bit
     will always be one because of normalization. */

  bytes = ((uint_fast64_t) u_mant & MANTISSA_MASK) << MANTISSA_SHIFT;
  bytes |= ((uint_fast64_t) u_exp & EXPONENT_MASK) << EXPONENT_SHIFT;
  bytes |= ((uint_fast64_t) neg & SIGN_MASK) << SIGN_SHIFT;

  return bytes;
}


double ieee754_bin64dec(uint_fast64_t bytes)
{
  double output;
  int s_exp;
  unsigned u_exp;
  unsigned long long u_mant;
  bool neg;

  /* Extract the bit fields. */
  u_exp = (bytes >> EXPONENT_SHIFT) & EXPONENT_MASK;
  u_mant = (bytes >> MANTISSA_SHIFT) & MANTISSA_MASK;
  neg = (bytes >> SIGN_SHIFT) & SIGN_MASK;

  if (u_exp == 0x7ffu) {
    if (u_mant == 0) {
#ifdef INFINITY
      return neg ? -INFINITY : +INFINITY;
#else
      return neg ? -DBL_MAX : +DBL_MAX;
#endif
    }
    return NAN;
  }

  if (u_exp == 0) {
    if (u_mant == 0)
      return neg ? -0.0 : +0.0;

    /* Subnormal value */

    /* Multiply the mantissa by a power of two. */
    output = ldexp(u_mant, -(MANTISSA_BIT + 1022));

    if (neg)
      output = -output;

    return output;
  }

  /* Recover the top bit of the mantissa. */
  u_mant |= MANTISSA_MASK + 1;

  /* Convert offset exponent back into a native signed value. */
  s_exp = (int) u_exp - 1022;

  /* Multiply the mantissa by a power of two. */
  output = ldexp(u_mant, s_exp - (MANTISSA_BIT + 1));

  if (neg)
    output = -output;

  return output;
}

int main(int argc, const char *argv[])
{
  double input = atof(argv[1]);
  union {
    double dbl;
    unsigned char bytes[sizeof(double)];
  } swap;
  uint_fast64_t bytes = ieee754_bin64enc(input);
  double output = ieee754_bin64dec(bytes);

  printf("input = %g\n", input);
  printf("format: %04x %04x %04x %04x\n",
         (unsigned) (bytes >> 48) & 0xffff,
         (unsigned) (bytes >> 32) & 0xffff,
         (unsigned) (bytes >> 16) & 0xffff,
         (unsigned) (bytes >> 0) & 0xffff);

  if (sizeof(double) == 8) {
    swap.dbl = input;
    printf("union: %02x%02x %02x%02x %02x%02x %02x%02x\n",
           swap.bytes[7],
           swap.bytes[6],
           swap.bytes[5],
           swap.bytes[4],
           swap.bytes[3],
           swap.bytes[2],
           swap.bytes[1],
           swap.bytes[0]);
  }

  printf("output = %g\n", output);

  printf("diff = %g\n", (output - input));

  return EXIT_SUCCESS;
}

ldexp with a negative power will restore the mantissa to a floating-point value, then another ldexp will incorporate the exponent (hence, they are combined in the code above).