[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/1] Implement math.trunc and math.fast2sum
- Subject: [PATCH 1/1] Implement math.trunc and math.fast2sum
- From: "S. Gilles" <sgilles@xxxxxxxxxxxx>
- Reply-to: myrddin-dev@xxxxxxxxxxxxxx
- Date: Mon, 5 Mar 2018 03:08:56 -0500
- To: "myrddin-dev" <myrddin-dev@xxxxxxxxxxxxxx>
- Cc: "S. Gilles" <sgilles@xxxxxxxxxxxx>
In the event that Kahan's summation, or Priest's doubly-compensated
summation, are desired, Fast2Sum will be useful. Trunc/floor/ceil
are probably the simplest FP functions that require bit-poking.
---
lib/bld.sub | 1 +
lib/math/bld.sub | 5 ++
lib/math/fpmath.myr | 199 +++++++++++++++++++++++++++++++++++++++++++++++
lib/math/test/fpmath.myr | 163 ++++++++++++++++++++++++++++++++++++++
4 files changed, 368 insertions(+)
create mode 100644 lib/math/bld.sub
create mode 100644 lib/math/fpmath.myr
create mode 100644 lib/math/test/fpmath.myr
diff --git a/lib/bld.sub b/lib/bld.sub
index 97effae9..8bcceb83 100644
--- a/lib/bld.sub
+++ b/lib/bld.sub
@@ -8,6 +8,7 @@ sub =
inifile
iter
json
+ math
regex
std
sys
diff --git a/lib/math/bld.sub b/lib/math/bld.sub
new file mode 100644
index 00000000..da2e69ea
--- /dev/null
+++ b/lib/math/bld.sub
@@ -0,0 +1,5 @@
+lib math =
+ fpmath.myr
+
+ lib ../std:std
+;;
diff --git a/lib/math/fpmath.myr b/lib/math/fpmath.myr
new file mode 100644
index 00000000..8c86be98
--- /dev/null
+++ b/lib/math/fpmath.myr
@@ -0,0 +1,199 @@
+use std
+
+pkg math =
+ trait fpmath @f =
+
+ /* round floating point numbers to near, integer values */
+ trunc : (f : @f -> @f)
+ ceil : (f : @f -> @f)
+ floor : (f : @f -> @f)
+
+ /* compute (s, t) with s = round-nearest(a+b), s + t = a + b */
+ fast2sum : (a : @f, b : @f -> (@f, @f))
+ ;;
+
+ impl fpmath flt32
+ impl fpmath flt64
+;;
+
+impl fpmath flt32 =
+ trunc = {f : flt32
+ var u : uint32 = std.flt32bits(f)
+ var e : uint32 = (((u >> 23) & 0xff) : uint32) - 127
+ var e_lt_zero : uint32 = ((e >> 31) : uint32) & 0x1
+ var e_ge_zero : uint32 = 1 - e_lt_zero
+ var e_ge_23 : uint32 = 1 - ((e - 23) >> 31)
+ /*
+ The significand 1 . m1 m2 ... m23 needs to be
+ truncated, which corresponds to zeroing all mi
+ bits where i is beyond the exponent e (they are
+ the actual sub-integer portion).
+ */
+ var m : uint32 = ~(((1 << 23) - 1) >> e)
+ m |= (-1 : uint32) * e_ge_23
+ var v_ge_zero : uint32 = (u & m) * e_ge_zero
+
+ /*
+ On the other hand, if the exponent is < 0, "23
+ - e" is garbage, and we should just return +/-
+ zero
+ */
+ var v_lt_zero : uint32 = (u & (1 << 31)) * e_lt_zero
+
+ /* Try to save a branch */
+ var v : uint32 = v_ge_zero + v_lt_zero
+ -> std.flt32frombits(v)
+ }
+
+ floor = {f : flt32
+ var u : uint32 = std.flt32bits(f)
+ var e : int32 = (((u >> 23) & 0xff) : int32) - 127
+ var shift_e : uint32 = (e : uint32)
+
+ /* Many special cases */
+ if e >= 23 || u == 0x80000000
+ -> f
+ elif (e < 0) && (u & (1 << 31) != 0)
+ -> -1.0
+ elif e < 0
+ -> 0.0
+ ;;
+
+ if u & (1 << 31) != 0
+ var fractional_mask : uint32 = (((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 23) >> shift_e)
+ ;;
+ -> std.flt32frombits(v)
+ ;;
+
+ var m : uint32 = ~(((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & m
+ -> std.flt32frombits(v)
+ }
+
+ ceil = {f;
+ var u : uint32 = std.flt32bits(f)
+ var e : int32 = (((u >> 23) & 0xff) : int32) - 127
+ var shift_e : uint32 = (e : uint32)
+ if e >= 23 || u == 0x0
+ -> f
+ elif (e < 0) && (u & (1 << 31) == 0)
+ -> 1.0
+ elif e < 0
+ -> -0.0
+ ;;
+
+ if u & (1 << 31) == 0
+ var fractional_mask : uint32 = (((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 23) >> shift_e)
+ ;;
+ -> std.flt32frombits(v)
+ ;;
+
+ var m : uint32 = ~(((1 << 23) - 1) >> shift_e)
+ var v : uint32 = u & m
+ -> std.flt32frombits(v)
+ }
+
+ fast2sum = {a, b;
+ var u = std.flt32bits(a)
+ var v = std.flt32bits(b)
+ if (u & (0xff << 23)) < (v & (0xff << 23))
+ var t = a
+ a = b
+ b = t
+ ;;
+ var s = a + b
+ var z = s - a
+ var t = b - z
+ -> (s, t)
+ }
+;;
+
+impl fpmath flt64 =
+ trunc = {f : flt64
+ var u : uint64 = std.flt64bits(f)
+ var e : uint64 = (((u >> 52) & 0x7ff) : uint64) - 1023
+ var e_lt_zero : uint64 = ((e >> 63) : uint64) & 0x1
+ var e_ge_zero : uint64 = 1 - e_lt_zero
+ var e_ge_52 : uint64 = 1 - ((e - 52) >> 63)
+ var m : uint64 = ~(((1 << 52) - 1) >> e)
+ m |= (-1 : uint64) * e_ge_52
+ var v_ge_zero : uint64 = (u & m) * e_ge_zero
+ var v_lt_zero : uint64 = (u & (1 << 63)) * e_lt_zero
+ var v : uint64 = v_ge_zero + v_lt_zero
+ -> std.flt64frombits(v)
+ }
+
+ floor = {f : flt64
+ var u : uint64 = std.flt64bits(f)
+ var e : int64 = (((u >> 52) & 0x7ff) : int64) - 1023
+ var shift_e : uint64 = (e : uint64)
+
+ if e >= 52 || u == 0x8000000000000000ul
+ -> f
+ elif (e < 0) && (u & (1 << 63) != 0)
+ -> -1.0
+ elif e < 0
+ -> 0.0
+ ;;
+
+ if u & (1 << 63) != 0
+ var fractional_mask : uint64 = (((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 52) >> shift_e)
+ ;;
+ -> std.flt64frombits(v)
+ ;;
+
+ var m : uint64 = ~(((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & m
+ -> std.flt64frombits(v)
+ }
+
+ ceil = {f;
+ var u : uint64 = std.flt64bits(f)
+ var e : int64 = (((u >> 52) & 0x7ff) : int64) - 1023
+ var shift_e : uint64 = (e : uint64)
+
+ if e >= 52 || u == 0x0ul
+ -> f
+ elif (e < 0) && (u & (1 << 63) == 0)
+ -> 1.0
+ elif e < 0
+ -> -0.0
+ ;;
+
+ if u & (1 << 63) == 0
+ var fractional_mask : uint64 = (((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & ~fractional_mask
+ if (u & fractional_mask) != 0
+ v += ((1 << 52) >> shift_e)
+ ;;
+ -> std.flt64frombits(v)
+ ;;
+
+ var m : uint64 = ~(((1 << 52) - 1) >> shift_e)
+ var v : uint64 = u & m
+ -> std.flt64frombits(v)
+ }
+
+ fast2sum = {a, b;
+ var u = std.flt64bits(a)
+ var v = std.flt64bits(b)
+ if (u & (0x7ff << 52)) < (v & (0x7ff << 52))
+ var t = a
+ a = b
+ b = t
+ ;;
+ var s = a + b
+ var z = s - a
+ var t = b - z
+ -> (s, t)
+ }
+;;
diff --git a/lib/math/test/fpmath.myr b/lib/math/test/fpmath.myr
new file mode 100644
index 00000000..64de5160
--- /dev/null
+++ b/lib/math/test/fpmath.myr
@@ -0,0 +1,163 @@
+use std
+use math
+use testr
+
+const main = {
+ testr.run([
+ [.name = "trunc-01", .fn = trunc01],
+ [.name = "trunc-02", .fn = trunc02],
+ [.name = "floor-01", .fn = floor01],
+ [.name = "floor-02", .fn = floor02],
+ [.name = "ceil-01", .fn = ceil01],
+ [.name = "ceil-02", .fn = ceil02],
+ [.name = "fast2sum-01", .fn = fast2sum01],
+ ][:])
+}
+
+impl std.equatable flt32 =
+ eq = {a : flt32, b : flt32; -> std.flt32bits(a) == std.flt32bits(b)}
+;;
+
+impl std.equatable flt64 =
+ eq = {a : flt64, b : flt64; -> std.flt64bits(a) == std.flt64bits(b)}
+;;
+
+const trunc01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const trunc02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (1.0, 1.0),
+ (1.1, 1.0),
+ (0.9, 0.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (101.999, 101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.trunc(f), g)
+ ;;
+}
+
+const floor01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const floor02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 0.0),
+ (1.1, 1.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -4.0),
+ (-101.999, -102.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.floor(f), g)
+ ;;
+}
+
+const ceil01 = {c
+ var flt32s : (flt32, flt32)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (10664524000000000000.0, 10664524000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt32nan(), std.flt32nan()),
+ ][:]
+
+ for (f, g) : flt32s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const ceil02 = {c
+ var flt64s : (flt64, flt64)[:] = [
+ (0.0, 0.0),
+ (-0.0, -0.0),
+ (0.5, 1.0),
+ (-0.1, -0.0),
+ (1.1, 2.0),
+ (13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0, 13809453812721350000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.0),
+ (-3.5, -3.0),
+ (-101.999, -101.0),
+ (std.flt64nan(), std.flt64nan()),
+ ][:]
+
+ for (f, g) : flt64s
+ testr.eq(c, math.ceil(f), g)
+ ;;
+}
+
+const fast2sum01 = {c
+ var flt32s : (flt32, flt32, flt32, flt32)[:] = [
+ (1.0, 1.0, 2.0, 0.0),
+ (10664524000000000000.0, 1.11842, 10664524000000000000.0, 1.11842),
+ (1.11843, 10664524000000000000.0, 10664524000000000000.0, 1.11843),
+ (-21897.1324, 17323.22, -4573.912, 0.0),
+ ][:]
+
+ for (a, b, s1, t1) : flt32s
+ var s2, t2
+ (s2, t2) = math.fast2sum(a, b)
+ testr.eq(c, s1, s2)
+ testr.eq(c, t1, t2)
+ ;;
+
+ var flt64s : (flt64, flt64, flt64, flt64)[:] = [
+ (1.0, 1.0, 2.0, 0.0),
+ (-21897.1324, 17323.22, -4573.912399999997, 0.0),
+ (std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884),
+ std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884)),
+ (std.flt64frombits(0x6a8c3190e27a1884), std.flt64frombits(0x78591b0672a81284),
+ std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x6a8c3190e27a1884)),
+ (std.flt64frombits(0x78591b0672a81284), std.flt64frombits(0x7858273672ca19a0),
+ std.flt64frombits(0x7868a11e72b91612), 0.0),
+ ][:]
+
+ for (a, b, s1, t1) : flt64s
+ var s2, t2
+ (s2, t2) = math.fast2sum(a, b)
+ testr.eq(c, s1, s2)
+ testr.eq(c, std.flt64bits(t1), std.flt64bits(t2))
+ ;;
+}
--
2.16.2