bench: add float8 operation benchmarks

dndungu · dndungu · commit 8e0c6574e361 · 2026-03-30T07:01:19.000-07:00
Add BenchmarkFromFloat32, BenchmarkToFloat32_Modes, BenchmarkAddModes,
BenchmarkMulModes, BenchmarkSub, and BenchmarkDiv with sub-benchmarks
for algorithmic vs lookup-table paths. Record baseline results in
docs/devlog.md.
diff --git a/benchmark_test.go b/benchmark_test.go
@@ -0,0 +1,148 @@
+package float8
+
+import (
+	"testing"
+)
+
+// BenchmarkFromFloat32 benchmarks float32 → Float8 conversion (ToFloat8),
+// running one sub-benchmark per representative input.
+func BenchmarkFromFloat32(b *testing.B) {
+	// The sub-benchmarks differ only in their input value, so they are driven
+	// from a single table, which keeps the measured loop identical for all.
+	cases := []struct {
+		name  string
+		input float32
+	}{
+		{"Normal", 1.5},
+		// 2^-9. With the E4M3 layout implied by the 448 maximum below this is
+		// the smallest positive subnormal (the smallest normal would be 2^-6),
+		// so it exercises the subnormal path. TODO confirm against the encoder.
+		{"Subnormal", 0.001953125},
+		{"Zero", 0.0},
+		{"Large", 448.0}, // max finite float8
+	}
+	for _, tc := range cases {
+		// Per-case copy of the input, captured by the closure below.
+		f32 := tc.input
+		b.Run(tc.name, func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				_ = ToFloat8(f32)
+			}
+		})
+	}
+}
+
+// BenchmarkToFloat32_Modes measures Float8 → float32 conversion twice: once
+// after DisableFastConversion ("Algorithmic"), once after EnableFastConversion ("Lookup").
+func BenchmarkToFloat32_Modes(b *testing.B) {
+	input := ToFloat8(1.5)
+	// Each mode switch happens before ResetTimer, so it is not part of the timing.
+	b.Run("Algorithmic", func(sub *testing.B) {
+		DisableFastConversion()
+		sub.ResetTimer()
+		for n := 0; n < sub.N; n++ {
+			_ = input.ToFloat32()
+		}
+	})
+	b.Run("Lookup", func(sub *testing.B) {
+		EnableFastConversion()
+		sub.ResetTimer()
+		for n := 0; n < sub.N; n++ {
+			_ = input.ToFloat32()
+		}
+		sub.StopTimer() // stop the clock, then switch fast conversion back off
+		DisableFastConversion()
+	})
+}
+
+// BenchmarkAddModes times Add on a fixed operand pair (1.5 and 2.5), first
+// with fast arithmetic disabled ("Algorithmic"), then enabled ("Lookup").
+func BenchmarkAddModes(b *testing.B) {
+	lhs, rhs := ToFloat8(1.5), ToFloat8(2.5)
+
+	b.Run("Algorithmic", func(sub *testing.B) {
+		DisableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Add(lhs, rhs)
+		}
+	})
+	b.Run("Lookup", func(sub *testing.B) {
+		EnableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Add(lhs, rhs)
+		}
+		sub.StopTimer()
+		DisableFastArithmetic() // switch fast arithmetic back off once timing has stopped
+	})
+}
+
+// BenchmarkMulModes times Mul on a fixed operand pair (1.5 and 2.5), first
+// with fast arithmetic disabled ("Algorithmic"), then enabled ("Lookup").
+func BenchmarkMulModes(b *testing.B) {
+	lhs, rhs := ToFloat8(1.5), ToFloat8(2.5)
+
+	b.Run("Algorithmic", func(sub *testing.B) {
+		DisableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Mul(lhs, rhs)
+		}
+	})
+	b.Run("Lookup", func(sub *testing.B) {
+		EnableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Mul(lhs, rhs)
+		}
+		sub.StopTimer()
+		DisableFastArithmetic() // switch fast arithmetic back off once timing has stopped
+	})
+}
+
+// BenchmarkSub times Sub on a fixed operand pair (3.5 and 1.5), first with
+// fast arithmetic disabled ("Algorithmic"), then enabled ("Lookup").
+func BenchmarkSub(b *testing.B) {
+	lhs, rhs := ToFloat8(3.5), ToFloat8(1.5)
+
+	b.Run("Algorithmic", func(sub *testing.B) {
+		DisableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Sub(lhs, rhs)
+		}
+	})
+	b.Run("Lookup", func(sub *testing.B) {
+		EnableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Sub(lhs, rhs)
+		}
+		sub.StopTimer()
+		DisableFastArithmetic() // switch fast arithmetic back off once timing has stopped
+	})
+}
+
+// BenchmarkDiv times Div on a fixed operand pair (3.5 and 1.5), first with
+// fast arithmetic disabled ("Algorithmic"), then enabled ("Lookup").
+func BenchmarkDiv(b *testing.B) {
+	lhs, rhs := ToFloat8(3.5), ToFloat8(1.5)
+
+	b.Run("Algorithmic", func(sub *testing.B) {
+		DisableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Div(lhs, rhs)
+		}
+	})
+	b.Run("Lookup", func(sub *testing.B) {
+		EnableFastArithmetic()
+		sub.ResetTimer() // keep the mode switch out of the measurement
+		for n := 0; n < sub.N; n++ {
+			_ = Div(lhs, rhs)
+		}
+		sub.StopTimer()
+		DisableFastArithmetic() // switch fast arithmetic back off once timing has stopped
+	})
+}
diff --git a/docs/devlog.md b/docs/devlog.md
@@ -0,0 +1,36 @@
+# Float8 Development Log
+
+## 2026-03-29 -- Baseline Benchmarks
+
+Recorded on Apple M4 (darwin/arm64), Go 1.25, `-benchmem -count=3`.
+
+### Conversion
+
+| Benchmark | ns/op | B/op | allocs/op |
+|-----------|------:|-----:|----------:|
+| FromFloat32/Normal | 2.50 | 0 | 0 |
+| FromFloat32/Subnormal | 2.53 | 0 | 0 |
+| FromFloat32/Zero | 0.98 | 0 | 0 |
+| FromFloat32/Large | 2.54 | 0 | 0 |
+| ToFloat32_Modes/Algorithmic | 1.36 | 0 | 0 |
+| ToFloat32_Modes/Lookup | 0.38 | 0 | 0 |
+
+### Arithmetic (Algorithmic vs Lookup)
+
+| Benchmark | Algorithmic ns/op | Lookup ns/op | Speedup |
+|-----------|------------------:|-------------:|--------:|
+| Add | 7.08 | 0.99 | 7.2x |
+| Sub | 6.91 | 0.99 | 7.0x |
+| Mul | 7.27 | 1.00 | 7.3x |
+| Div | 7.85 | 1.00 | 7.9x |
+
+### Batch Operations (1000 elements)
+
+| Benchmark | ns/op | B/op | allocs/op |
+|-----------|------:|-----:|----------:|
+| ToSlice8 | 3351 | 1024 | 1 |
+| ToSlice32 | 1592 | 4096 | 1 |
+
+All scalar operations are zero-allocation. Lookup tables provide a consistent
+7.0x-7.9x speedup over algorithmic arithmetic at the cost of 256 KiB of
+memory (4 tables x 64K entries x 1 byte).