From e9956ac9272551cccd35b9ddc863f5941843ab53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 4 Jun 2026 13:27:45 -0700
Subject: [PATCH 1/7] A2FastModel: allocate layer history buffers from a single
 arena

Previously each of the 23 layers held its own std::vector<float> history.
Heap allocators can place these at addresses that share the same L1 cache
sets, causing conflict misses on the wide-dilation layers whose tap reads,
_layer_in writes, and head-history writes simultaneously compete for cache.

Replace with a single contiguous allocation (_history_arena) from which
each layer takes a raw pointer slice. Sequential placement naturally
distributes base addresses across cache sets because each slot size mod
the set stride is non-zero.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-Authored-By: Landon McCoy <landon-chaos@users.noreply.github.com>
---
 NAM/wavenet/a2_fast.cpp | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)
diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 8fd8fc1..016d990 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -91,7 +91,8 @@ class A2FastModel : public DSP
     std::array<float, Channels> l1x1_b{};
 
     // Conv1D input history ring buffer, column-major (Channels rows).
-    std::vector<float> history;
+    // Points into A2FastModel::_history_arena; do not free.
+    float* history = nullptr;
   #if NAM_A2_RING_MODE == 1
     // pow2 ring + tail mirror. Storage = (pow2_size + max_buffer_size) cols.
     // write_pos is kept in [0, pow2_size), reads use (pos & pow2_mask) and are
@@ -133,6 +134,12 @@ class A2FastModel : public DSP
   int _head_write_pos = 0;
   #endif
 
+  // Single contiguous allocation for all 23 layers' history buffers.
+  // Placing them in one arena rather than 23 separate vectors distributes
+  // their base addresses across cache sets, reducing L1 set-conflict misses
+  // on narrow caches (e.g. Cortex-A8 with 16KB / 4-way = 4KB set stride).
+  std::vector<float> _history_arena;
+
   // Working buffers (all Channels rows, max_buffer_size cols, col-major).
   std::vector<float> _layer_in; // current layer input / next layer input (in-place residual)
   std::vector<float> _head_sum; // accumulates activations across all layers
@@ -298,18 +305,29 @@ void A2FastModel<Channels>::SetMaxBufferSize(int maxBufferSize)
   _cond.assign(static_cast<size_t>(maxBufferSize), 0.0f);
   _head_out.assign(static_cast<size_t>(maxBufferSize), 0.0f);
 
-  for (auto& L : _layers)
+  // Compute per-layer slot sizes, then allocate one contiguous arena.
+  std::array<size_t, kNumLayers> slot_sizes{};
+  size_t arena_total = 0;
+  for (int i = 0; i < kNumLayers; i++)
   {
+    Layer& L = _layers[i];
   #if NAM_A2_RING_MODE == 1
     L.pow2_size = next_pow2(L.max_lookback + maxBufferSize);
     L.pow2_mask = L.pow2_size - 1;
-    L.history.assign(static_cast<size_t>(Channels) * (L.pow2_size + maxBufferSize), 0.0f);
-    L.write_pos = L.max_lookback;
+    slot_sizes[i] = static_cast<size_t>(Channels) * (L.pow2_size + maxBufferSize);
   #else
     L.history_cols = 2 * L.max_lookback + maxBufferSize;
-    L.history.assign(static_cast<size_t>(Channels) * L.history_cols, 0.0f);
-    L.write_pos = L.max_lookback;
+    slot_sizes[i] = static_cast<size_t>(Channels) * L.history_cols;
   #endif
+    L.write_pos = L.max_lookback;
+    arena_total += slot_sizes[i];
+  }
+  _history_arena.assign(arena_total, 0.0f);
+  size_t arena_offset = 0;
+  for (int i = 0; i < kNumLayers; i++)
+  {
+    _layers[i].history = _history_arena.data() + arena_offset;
+    arena_offset += slot_sizes[i];
   }
 
   const int head_lookback = kHeadKernelSize - 1;
@@ -338,7 +356,7 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
 {
   #if NAM_A2_RING_MODE == 1
   const int mbs = GetMaxBufferSize();
-  float* const hist = L.history.data();
+  float* const hist = L.history;
   const float* const src = _layer_in.data();
   const int wp = L.write_pos;
   const int first = std::min(num_frames, L.pow2_size - wp);
@@ -355,11 +373,11 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
   if (L.write_pos + num_frames > L.history_cols)
   {
     const int keep = L.max_lookback;
-    std::memmove(L.history.data(), L.history.data() + static_cast<size_t>(L.write_pos - keep) * Channels,
+    std::memmove(L.history, L.history + static_cast<size_t>(L.write_pos - keep) * Channels,
                  static_cast<size_t>(keep) * Channels * sizeof(float));
     L.write_pos = keep;
   }
-  std::memcpy(L.history.data() + static_cast<size_t>(L.write_pos) * Channels, _layer_in.data(),
+  std::memcpy(L.history + static_cast<size_t>(L.write_pos) * Channels, _layer_in.data(),
               static_cast<size_t>(num_frames) * Channels * sizeof(float));
   L.write_pos += num_frames;
   #endif

From 5961e7960cf0a52306bdf5a8e406213f07a52ba6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 4 Jun 2026 13:30:31 -0700
Subject: [PATCH 2/7] A2FastModel: write head activations directly to
 _head_history

Previously each layer accumulated its post-activation output into a
separate _head_sum buffer, which was then zeroed before every block
(memset of Channels*num_frames floats) and copied into _head_history
by _head_ring_write (another memcpy of the same size) after all layers
finished.

Replace with direct writes: layer 0 assigns (IsFirst=true template
parameter), layers 1-22 accumulate (IsFirst=false). The IsFirst branch
resolves at compile time, so there is no runtime branch in the hot loop.

_head_ring_write is removed; process() now manages the head ring write
position directly, rewinding with a cheap memmove of kHeadKernelSize-1
columns when the end of the buffer is reached.

Also eliminates ztile.setZero() for the Channels=8 Eigen path: tap 0
now assigns into ztile (noalias() =) rather than accumulating on top of
a zero-initialised matrix, saving one Channels*num_frames write per layer.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-Authored-By: Landon McCoy <landon-chaos@users.noreply.github.com>
---
 NAM/wavenet/a2_fast.cpp | 131 +++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 61 deletions(-)

diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 016d990..ceb67bf 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -142,7 +142,6 @@ class A2FastModel : public DSP
 
   // Working buffers (all Channels rows, max_buffer_size cols, col-major).
   std::vector<float> _layer_in; // current layer input / next layer input (in-place residual)
-  std::vector<float> _head_sum; // accumulates activations across all layers
   std::vector<float> _z; // per-layer conv output accumulator (tap-major)
   std::vector<float> _cond; // float32 copy of the double NAM_SAMPLE input, reused each block
   std::vector<float> _head_out; // float32 head output before writing to NAM_SAMPLE
@@ -151,15 +150,16 @@ class A2FastModel : public DSP
 
   void _load_weights(std::vector<float>& weights);
   void _ring_write(Layer& L, int num_frames);
-  void _head_ring_write(int num_frames);
-  void _layer_forward(int layer_idx, const float* cond, int num_frames);
+  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, int head_wp);
   void _head_forward(float* output, int num_frames);
 
   // Compile-time-specialized per-layer kernel. KernelSize is lifted to a
   // template parameter so clang can fully unroll the tap loop and schedule
   // FMAs across taps. For the A2 shape we only need K=6 and K=15.
-  template <int KernelSize>
-  void _layer_forward_k(Layer& L, const float* cond, int num_frames);
+  // IsFirst: true for layer 0 only. Selects = vs += when writing the head
+  // accumulator directly into _head_history, avoiding a separate zeroing pass.
+  template <int KernelSize, bool IsFirst>
+  void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp);
 };
 
 // -----------------------------------------------------------------------------
@@ -300,7 +300,6 @@ void A2FastModel<Channels>::SetMaxBufferSize(int maxBufferSize)
   DSP::SetMaxBufferSize(maxBufferSize);
 
   _layer_in.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
-  _head_sum.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
   _z.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
   _cond.assign(static_cast<size_t>(maxBufferSize), 0.0f);
   _head_out.assign(static_cast<size_t>(maxBufferSize), 0.0f);
@@ -383,50 +382,18 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
   #endif
 }
 
-template <int Channels>
-void A2FastModel<Channels>::_head_ring_write(int num_frames)
-{
-  #if NAM_A2_RING_MODE == 1
-  const int mbs = GetMaxBufferSize();
-  float* const hist = _head_history.data();
-  const float* const src = _head_sum.data();
-  const int wp = _head_write_pos;
-  const int first = std::min(num_frames, _head_pow2_size - wp);
-  std::memcpy(hist + static_cast<size_t>(wp) * Channels, src, static_cast<size_t>(first) * Channels * sizeof(float));
-  if (first < num_frames)
-  {
-    std::memcpy(hist, src + static_cast<size_t>(first) * Channels,
-                static_cast<size_t>(num_frames - first) * Channels * sizeof(float));
-  }
-  std::memcpy(
-    hist + static_cast<size_t>(_head_pow2_size) * Channels, hist, static_cast<size_t>(mbs) * Channels * sizeof(float));
-  _head_write_pos = (wp + num_frames) & _head_pow2_mask;
-  #else
-  const int keep = kHeadKernelSize - 1;
-  if (_head_write_pos + num_frames > _head_history_cols)
-  {
-    std::memmove(_head_history.data(), _head_history.data() + static_cast<size_t>(_head_write_pos - keep) * Channels,
-                 static_cast<size_t>(keep) * Channels * sizeof(float));
-    _head_write_pos = keep;
-  }
-  std::memcpy(_head_history.data() + static_cast<size_t>(_head_write_pos) * Channels, _head_sum.data(),
-              static_cast<size_t>(num_frames) * Channels * sizeof(float));
-  _head_write_pos += num_frames;
-  #endif
-}
-
 // -----------------------------------------------------------------------------
 // Per-layer forward pass. Reads current _layer_in, writes back into _layer_in
 // after applying dilated conv + mixin + LeakyReLU + layer1x1 residual, and
-// accumulates activations into _head_sum.
+// writes/accumulates activations directly into _head_history at head_wp.
 // -----------------------------------------------------------------------------
 // Compile-time-specialized per-layer kernel. KernelSize is a template param
 // so the K tap loop + per-tap weight offsets become compile-time constants;
 // clang fully unrolls and can schedule FMAs across taps. Called from the
 // runtime dispatcher below for each A2 kernel size (6 and 15).
 template <int Channels>
-template <int KernelSize>
-void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int num_frames)
+template <int KernelSize, bool IsFirst>
+void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp)
 {
   constexpr int K = KernelSize;
   const int D = L.dilation;
@@ -557,11 +524,17 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
       a0 = (a0 >= 0.0f) ? a0 : a0 * kLeakySlope;
       a1 = (a1 >= 0.0f) ? a1 : a1 * kLeakySlope;
       a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope;
-      // Head sum accumulate.
-      float* hsum = &_head_sum[static_cast<size_t>(f) * 3];
-      hsum[0] += a0;
-      hsum[1] += a1;
-      hsum[2] += a2;
+      // Write/accumulate directly into _head_history at the current ring slot.
+      float* hslot = &_head_history[static_cast<size_t>(head_wp + f) * 3];
+      if constexpr (IsFirst) {
+        hslot[0] = a0;
+        hslot[1] = a1;
+        hslot[2] = a2;
+      } else {
+        hslot[0] += a0;
+        hslot[1] += a1;
+        hslot[2] += a2;
+      }
       // layer1x1 residual.
       float* lin = &_layer_in[static_cast<size_t>(f) * 3];
       lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
@@ -596,25 +569,29 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
     Eigen::Map<const RowDyn> cond_row(cond, 1, num_frames);
 
     Eigen::Map<MatCDyn> ztile(_z.data(), Channels, num_frames);
-    Eigen::Map<MatCDyn> hsum_block(_head_sum.data(), Channels, num_frames);
+    Eigen::Map<MatCDyn> hslot_block(&_head_history[static_cast<size_t>(head_wp) * Channels], Channels, num_frames);
     Eigen::Map<MatCDyn> lin_block(_layer_in.data(), Channels, num_frames);
 
-    ztile.setZero();
-
-    // Conv: one 8x8 × 8xN GEMM per tap.
+    // Conv: one 8x8 × 8xN GEMM per tap. Tap 0 assigns into ztile (no setZero needed).
     for (int k = 0; k < K; k++)
     {
       const int tap_base = tap_base_phys(K - 1 - k);
       Eigen::Map<const MatCC> W(&L.conv_w[static_cast<size_t>(k) * Channels * Channels]);
       Eigen::Map<const MatCDyn> input_block(&L.history[static_cast<size_t>(tap_base) * Channels], Channels, num_frames);
-      ztile.noalias() += W * input_block;
+      if (k == 0)
+        ztile.noalias() = W * input_block;
+      else
+        ztile.noalias() += W * input_block;
     }
 
-    // Post-conv: bias, mixin, LeakyReLU, head_sum, 1x1 residual — all block ops.
+    // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual.
     ztile.colwise() += conv_b_vec;
     ztile.noalias() += mixin_vec * cond_row; // rank-1 outer product
     ztile = (ztile.array() < 0.0f).select(ztile.array() * kLeakySlope, ztile.array());
-    hsum_block += ztile;
+    if constexpr (IsFirst)
+      hslot_block = ztile;
+    else
+      hslot_block += ztile;
     lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
     lin_block.colwise() += l1x1_b_vec;
   }
@@ -624,14 +601,21 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
 // For the A2 shape the detector only admits K in {6, 15}; any other value
 // here means something passed the detector that shouldn't have.
 template <int Channels>
-void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int num_frames)
+void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first,
+                                           int head_wp)
 {
   Layer& L = _layers[layer_idx];
   _ring_write(L, num_frames);
   switch (L.kernel_size)
   {
-    case 6: _layer_forward_k<6>(L, cond, num_frames); break;
-    case 15: _layer_forward_k<15>(L, cond, num_frames); break;
+    case 6:
+      if (is_first) _layer_forward_k<6, true>(L, cond, num_frames, head_wp);
+      else          _layer_forward_k<6, false>(L, cond, num_frames, head_wp);
+      break;
+    case 15:
+      if (is_first) _layer_forward_k<15, true>(L, cond, num_frames, head_wp);
+      else          _layer_forward_k<15, false>(L, cond, num_frames, head_wp);
+      break;
     default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size));
   }
 }
@@ -639,10 +623,11 @@ void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int
 // -----------------------------------------------------------------------------
 // Head: K=16 dilation-1 conv from Channels to 1, plus bias + scale.
 // -----------------------------------------------------------------------------
+// process() writes all 23 layers' activations directly into _head_history
+// before calling this function, and has already advanced _head_write_pos.
 template <int Channels>
 void A2FastModel<Channels>::_head_forward(float* output, int num_frames)
 {
-  _head_ring_write(num_frames);
   #if NAM_A2_RING_MODE == 1
   const int mask = _head_pow2_mask;
   auto col_of = [&](int f, int k) { return (_head_write_pos - num_frames + f - (kHeadKernelSize - 1 - k)) & mask; };
@@ -690,11 +675,35 @@ void A2FastModel<Channels>::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int
       lin[c] = _rechannel_w[c] * x;
   }
 
-  // Zero head accumulator.
-  std::memset(_head_sum.data(), 0, static_cast<size_t>(num_frames) * Channels * sizeof(float));
+  // Advance the head ring write position. Rewind if writing num_frames more
+  // would overflow the contiguous range, preserving the K-1 lookback window.
+  const int head_keep = kHeadKernelSize - 1;
+  #if NAM_A2_RING_MODE == 1
+  const int head_cap = _head_pow2_size;
+  #else
+  const int head_cap = _head_history_cols;
+  #endif
+  if (_head_write_pos + num_frames > head_cap)
+  {
+    std::memmove(_head_history.data(),
+                 _head_history.data() + static_cast<size_t>(_head_write_pos - head_keep) * Channels,
+                 static_cast<size_t>(head_keep) * Channels * sizeof(float));
+    _head_write_pos = head_keep;
+  }
+  const int head_wp = _head_write_pos;
 
-  for (int li = 0; li < kNumLayers; li++)
-    _layer_forward(li, cond, num_frames);
+  // Layer forward: layer 0 assigns into _head_history (IsFirst=true),
+  // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed.
+  _layer_forward(0, cond, num_frames, /*is_first=*/true, head_wp);
+  for (int li = 1; li < kNumLayers; li++)
+    _layer_forward(li, cond, num_frames, /*is_first=*/false, head_wp);
+
+  // Advance the write position past this buffer's worth of frames.
+  #if NAM_A2_RING_MODE == 1
+  _head_write_pos = (head_wp + num_frames) & _head_pow2_mask;
+  #else
+  _head_write_pos = head_wp + num_frames;
+  #endif
 
   // Output.
   float* head_out = _head_out.data();

From a77967807a68a9fc6b6183689d9e3aee39754bdb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 4 Jun 2026 13:48:45 -0700
Subject: [PATCH 3/7] A2FastModel: mirror-on-demand for per-layer ring writes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously _ring_write always refreshed the full tail mirror — a memcpy
of maxBufferSize * Channels * sizeof(float) bytes per layer per block
(4 KB for Ch=8 / 128 frames, 92 KB total across 23 layers per buffer).

Most of the time no tap read crosses the pow2_size boundary, so the
mirror copy was pure overhead. Replace with mirror-on-demand: after each
write, compute each tap's next-call read range and mirror only the
columns that actually overflow past pow2_size. Zero copy is emitted
whenever no tap straddles the wrap.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-Authored-By: Landon McCoy <landon-chaos@users.noreply.github.com>
---
 NAM/wavenet/a2_fast.cpp | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index ceb67bf..800a6c1 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -344,17 +344,18 @@ void A2FastModel<Channels>::SetMaxBufferSize(int maxBufferSize)
 
 // -----------------------------------------------------------------------------
 // Ring-write helpers.
-//   Mode 1: pow2 + tail mirror. Constant-time per block (one short memcpy
-//   into the ring, one mirror refresh).
-//   Mode 0: linear with periodic memmove rewind. When write_pos nears the
-//   end of history, memmove the trailing max_lookback cols back to offset 0
-//   and reset write_pos. That memmove is the jitter spike we're measuring.
+//   Mode 1: pow2 + tail mirror, mirror-on-demand. Reads in _layer_forward_k
+//   use `tap_base + f` without masking, so the mirror region at
+//   [pow2_size, pow2_size + mbs) must cover any reads that overflow past
+//   pow2_size. After each write, predict where the next call's tap reads will
+//   land and mirror only the columns that actually overflow — frequently zero.
+//   Avoids the full mbs*Channels*4 memcpy that a naive refresh would do.
+//   Mode 0: linear with periodic memmove rewind.
 // -----------------------------------------------------------------------------
 template <int Channels>
 void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
 {
   #if NAM_A2_RING_MODE == 1
-  const int mbs = GetMaxBufferSize();
   float* const hist = L.history;
   const float* const src = _layer_in.data();
   const int wp = L.write_pos;
@@ -365,9 +366,26 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
     std::memcpy(hist, src + static_cast<size_t>(first) * Channels,
                 static_cast<size_t>(num_frames - first) * Channels * sizeof(float));
   }
-  std::memcpy(
-    hist + static_cast<size_t>(L.pow2_size) * Channels, hist, static_cast<size_t>(mbs) * Channels * sizeof(float));
-  L.write_pos = (wp + num_frames) & L.pow2_mask;
+  const int new_wp = (wp + num_frames) & L.pow2_mask;
+
+  // Mirror-on-demand: for each tap, predict the read range in the next call
+  // and mirror only the columns that would overflow past pow2_size.
+  int mirror_needed = 0;
+  const int K = L.kernel_size;
+  const int D = L.dilation;
+  for (int k = 0; k < K; k++)
+  {
+    const int taps_back = K - 1 - k;
+    const int tap_base = (new_wp - num_frames - taps_back * D) & L.pow2_mask;
+    const int read_end = tap_base + num_frames - 1;
+    if (read_end >= L.pow2_size)
+      mirror_needed = std::max(mirror_needed, read_end - L.pow2_size + 1);
+  }
+  if (mirror_needed > 0)
+    std::memcpy(hist + static_cast<size_t>(L.pow2_size) * Channels, hist,
+                static_cast<size_t>(mirror_needed) * Channels * sizeof(float));
+
+  L.write_pos = new_wp;
   #else
   if (L.write_pos + num_frames > L.history_cols)
   {

From 6c2815df386917d49b2990cb5bc8d069f1e59148 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 4 Jun 2026 13:51:50 -0700
Subject: [PATCH 4/7] A2FastModel: portable T=4 frame-tiled conv for Channels
 >= 4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the Eigen noalias() GEMM loop with an explicit T=4 frame-tiled,
tap-major C++ kernel. Structure of the inner body:

    a[f][o] += Wcol[o] * h[f]     (o = output channel, f = frame in tile)

Wcol = W[:,cp] is stride-1 in o, so the o loop vectorizes. h[f] is a
scalar loaded from the (column-major) history buffer, so the compiler
emits a broadcast-scalar FMA instruction — vmlaq_n_f32 on AArch64, the
AVX equivalent on x86 — without requiring architecture-specific intrinsics.

The T=4 tiling amortizes the two weight register loads (W[:,cp] = 8
floats = 2 Q-regs on AArch64) across four independent accumulator chains,
matching the weight-reuse strategy of the explicit NEON microkernel.

A scalar tail handles buffer sizes that are not multiples of 4; in
practice audio buffer sizes (64, 128, 256) are always multiples of 4.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-Authored-By: Landon McCoy <landon-chaos@users.noreply.github.com>
---
 NAM/wavenet/a2_fast.cpp | 70 +++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 800a6c1..11d7978 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -590,16 +590,68 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
     Eigen::Map<MatCDyn> hslot_block(&_head_history[static_cast<size_t>(head_wp) * Channels], Channels, num_frames);
     Eigen::Map<MatCDyn> lin_block(_layer_in.data(), Channels, num_frames);
 
-    // Conv: one 8x8 × 8xN GEMM per tap. Tap 0 assigns into ztile (no setZero needed).
-    for (int k = 0; k < K; k++)
+    // Conv: T=4 frame-tiled, tap-major.
+    //
+    // For each tile of T=4 frames, accumulate all K taps and all Channels input
+    // channels into T*Channels output accumulators kept in local storage. The
+    // inner loop body is:
+    //
+    //   a[f][o] += Wcol[o] * h[f]   (o vectorized, h[f] scalar broadcast)
+    //
+    // where Wcol = W[:,cp] is stride-1 in o and h[f] = history[col+f][cp] is a
+    // scalar. On AArch64 the compiler emits vmlaq_n_f32 (SIMD FMA with scalar
+    // broadcast) with weight vector Wcol reused across all T frames — the same
+    // weight amortisation as an explicit NEON microkernel, without intrinsics.
+    // Equivalent SIMD broadcast-FMA instructions are emitted on x86 (AVX) and
+    // other SIMD targets automatically.
     {
-      const int tap_base = tap_base_phys(K - 1 - k);
-      Eigen::Map<const MatCC> W(&L.conv_w[static_cast<size_t>(k) * Channels * Channels]);
-      Eigen::Map<const MatCDyn> input_block(&L.history[static_cast<size_t>(tap_base) * Channels], Channels, num_frames);
-      if (k == 0)
-        ztile.noalias() = W * input_block;
-      else
-        ztile.noalias() += W * input_block;
+      float* z = _z.data();
+      constexpr int T = 4;
+      const int nF4 = (num_frames / T) * T;
+
+      for (int f = 0; f < nF4; f += T)
+      {
+        float a[T][Channels]{};  // zero-init; register-allocated by compiler
+
+        for (int k = 0; k < K; k++)
+        {
+          const float* W = &L.conv_w[static_cast<size_t>(k) * Channels * Channels];
+          const float* hb = L.history + static_cast<size_t>(tap_base_phys(K - 1 - k) + f) * Channels;
+          for (int cp = 0; cp < Channels; cp++)
+          {
+            const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o
+            const float h0 = hb[cp],                   h1 = hb[Channels + cp],
+                         h2 = hb[2 * Channels + cp],   h3 = hb[3 * Channels + cp];
+            for (int o = 0; o < Channels; o++)
+            {
+              a[0][o] += Wcol[o] * h0;
+              a[1][o] += Wcol[o] * h1;
+              a[2][o] += Wcol[o] * h2;
+              a[3][o] += Wcol[o] * h3;
+            }
+          }
+        }
+        for (int ti = 0; ti < T; ti++)
+          std::memcpy(z + static_cast<size_t>(f + ti) * Channels, a[ti], Channels * sizeof(float));
+      }
+
+      // Scalar tail for any frames past the T-aligned boundary.
+      for (int f = nF4; f < num_frames; f++)
+      {
+        float* zf = z + static_cast<size_t>(f) * Channels;
+        for (int o = 0; o < Channels; o++) zf[o] = 0.0f;
+        for (int k = 0; k < K; k++)
+        {
+          const float* W = &L.conv_w[static_cast<size_t>(k) * Channels * Channels];
+          const float* h = L.history + static_cast<size_t>(tap_base_phys(K - 1 - k) + f) * Channels;
+          for (int cp = 0; cp < Channels; cp++)
+          {
+            const float hv = h[cp];
+            const float* Wcol = W + cp * Channels;
+            for (int o = 0; o < Channels; o++) zf[o] += Wcol[o] * hv;
+          }
+        }
+      }
     }
 
     // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual.

From 78f92a225e4ef7db89d6a6c8afc93e42d020e581 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Mon, 8 Jun 2026 10:45:46 -0700
Subject: [PATCH 5/7] Dead terminal-residual elimination optimization + update
 to CMakeLists.txt to compile to Release by default.

---
 CMakeLists.txt          |  7 +++++++
 NAM/wavenet/a2_fast.cpp | 46 ++++++++++++++++++++++++++---------------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f9fe79..a5bc221 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,13 @@ project(NAM VERSION 0.4.0)
 
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
+# Default to a Release build: without this, an empty CMAKE_BUILD_TYPE compiles
+# with no optimization flags at all, which is 10x+ slower for Eigen-heavy code.
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+	set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE)
+	message(STATUS "No build type specified; defaulting to Release")
+endif()
+
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED OFF)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 11d7978..267b63d 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -150,7 +150,8 @@ class A2FastModel : public DSP
 
   void _load_weights(std::vector<float>& weights);
   void _ring_write(Layer& L, int num_frames);
-  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, int head_wp);
+  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last,
+                      int head_wp);
   void _head_forward(float* output, int num_frames);
 
   // Compile-time-specialized per-layer kernel. KernelSize is lifted to a
@@ -158,7 +159,10 @@ class A2FastModel : public DSP
   // FMAs across taps. For the A2 shape we only need K=6 and K=15.
   // IsFirst: true for layer 0 only. Selects = vs += when writing the head
   // accumulator directly into _head_history, avoiding a separate zeroing pass.
-  template <int KernelSize, bool IsFirst>
+  // IsLast: true for the final layer only. Its layer1x1 residual output feeds
+  // no downstream layer (the model output flows through _head_history ->
+  // _head_forward), so the 1x1 projection + residual store are dead and elided.
+  template <int KernelSize, bool IsFirst, bool IsLast>
   void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp);
 };
 
@@ -410,7 +414,7 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
 // clang fully unrolls and can schedule FMAs across taps. Called from the
 // runtime dispatcher below for each A2 kernel size (6 and 15).
 template <int Channels>
-template <int KernelSize, bool IsFirst>
+template <int KernelSize, bool IsFirst, bool IsLast>
 void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp)
 {
   constexpr int K = KernelSize;
@@ -553,11 +557,13 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
         hslot[1] += a1;
         hslot[2] += a2;
       }
-      // layer1x1 residual.
-      float* lin = &_layer_in[static_cast<size_t>(f) * 3];
-      lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
-      lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2;
-      lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2;
+      // layer1x1 residual (elided on the final layer: its output is dead).
+      if constexpr (!IsLast) {
+        float* lin = &_layer_in[static_cast<size_t>(f) * 3];
+        lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
+        lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2;
+        lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2;
+      }
     }
   }
   else
@@ -662,8 +668,11 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
       hslot_block = ztile;
     else
       hslot_block += ztile;
-    lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
-    lin_block.colwise() += l1x1_b_vec;
+    // layer1x1 residual (elided on the final layer: its output is dead).
+    if constexpr (!IsLast) {
+      lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
+      lin_block.colwise() += l1x1_b_vec;
+    }
   }
 }
 
@@ -672,19 +681,22 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
 // here means something passed the detector that shouldn't have.
 template <int Channels>
 void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first,
-                                           int head_wp)
+                                           bool is_last, int head_wp)
 {
   Layer& L = _layers[layer_idx];
   _ring_write(L, num_frames);
+  // is_first and is_last are mutually exclusive (>1 layer), so three combos.
   switch (L.kernel_size)
   {
     case 6:
-      if (is_first) _layer_forward_k<6, true>(L, cond, num_frames, head_wp);
-      else          _layer_forward_k<6, false>(L, cond, num_frames, head_wp);
+      if (is_first)     _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last) _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp);
+      else              _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp);
       break;
     case 15:
-      if (is_first) _layer_forward_k<15, true>(L, cond, num_frames, head_wp);
-      else          _layer_forward_k<15, false>(L, cond, num_frames, head_wp);
+      if (is_first)     _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last) _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp);
+      else              _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp);
       break;
     default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size));
   }
@@ -764,9 +776,9 @@ void A2FastModel<Channels>::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int
 
   // Layer forward: layer 0 assigns into _head_history (IsFirst=true),
   // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed.
-  _layer_forward(0, cond, num_frames, /*is_first=*/true, head_wp);
+  _layer_forward(0, cond, num_frames, /*is_first=*/true, /*is_last=*/false, head_wp);
   for (int li = 1; li < kNumLayers; li++)
-    _layer_forward(li, cond, num_frames, /*is_first=*/false, head_wp);
+    _layer_forward(li, cond, num_frames, /*is_first=*/false, /*is_last=*/li == kNumLayers - 1, head_wp);
 
   // Advance the write position past this buffer's worth of frames.
   #if NAM_A2_RING_MODE == 1

From 02d22775ead98b85093891013dcc5138f4ddd01c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Mon, 8 Jun 2026 10:51:55 -0700
Subject: [PATCH 6/7] Reformat A2Fast optimizations to repo clang-format style

Allman braces for the new if constexpr / dispatcher blocks, no
single-line loops, and collapse the manually-aligned declarations.
No behavioral change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 NAM/wavenet/a2_fast.cpp | 45 +++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 267b63d..62d0206 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -150,8 +150,7 @@ class A2FastModel : public DSP
 
   void _load_weights(std::vector<float>& weights);
   void _ring_write(Layer& L, int num_frames);
-  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last,
-                      int head_wp);
+  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, int head_wp);
   void _head_forward(float* output, int num_frames);
 
   // Compile-time-specialized per-layer kernel. KernelSize is lifted to a
@@ -548,17 +547,21 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
       a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope;
       // Write/accumulate directly into _head_history at the current ring slot.
       float* hslot = &_head_history[static_cast<size_t>(head_wp + f) * 3];
-      if constexpr (IsFirst) {
+      if constexpr (IsFirst)
+      {
         hslot[0] = a0;
         hslot[1] = a1;
         hslot[2] = a2;
-      } else {
+      }
+      else
+      {
         hslot[0] += a0;
         hslot[1] += a1;
         hslot[2] += a2;
       }
       // layer1x1 residual (elided on the final layer: its output is dead).
-      if constexpr (!IsLast) {
+      if constexpr (!IsLast)
+      {
         float* lin = &_layer_in[static_cast<size_t>(f) * 3];
         lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
         lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2;
@@ -617,7 +620,7 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
 
       for (int f = 0; f < nF4; f += T)
       {
-        float a[T][Channels]{};  // zero-init; register-allocated by compiler
+        float a[T][Channels]{}; // zero-init; register-allocated by compiler
 
         for (int k = 0; k < K; k++)
         {
@@ -626,8 +629,7 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
           for (int cp = 0; cp < Channels; cp++)
           {
             const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o
-            const float h0 = hb[cp],                   h1 = hb[Channels + cp],
-                         h2 = hb[2 * Channels + cp],   h3 = hb[3 * Channels + cp];
+            const float h0 = hb[cp], h1 = hb[Channels + cp], h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp];
             for (int o = 0; o < Channels; o++)
             {
               a[0][o] += Wcol[o] * h0;
@@ -645,7 +647,8 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
       for (int f = nF4; f < num_frames; f++)
       {
         float* zf = z + static_cast<size_t>(f) * Channels;
-        for (int o = 0; o < Channels; o++) zf[o] = 0.0f;
+        for (int o = 0; o < Channels; o++)
+          zf[o] = 0.0f;
         for (int k = 0; k < K; k++)
         {
           const float* W = &L.conv_w[static_cast<size_t>(k) * Channels * Channels];
@@ -654,7 +657,8 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
           {
             const float hv = h[cp];
             const float* Wcol = W + cp * Channels;
-            for (int o = 0; o < Channels; o++) zf[o] += Wcol[o] * hv;
+            for (int o = 0; o < Channels; o++)
+              zf[o] += Wcol[o] * hv;
           }
         }
       }
@@ -669,7 +673,8 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
     else
       hslot_block += ztile;
     // layer1x1 residual (elided on the final layer: its output is dead).
-    if constexpr (!IsLast) {
+    if constexpr (!IsLast)
+    {
       lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
       lin_block.colwise() += l1x1_b_vec;
     }
@@ -689,14 +694,20 @@ void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int
   switch (L.kernel_size)
   {
     case 6:
-      if (is_first)     _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp);
-      else if (is_last) _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp);
-      else              _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp);
+      if (is_first)
+        _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last)
+        _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp);
+      else
+        _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp);
       break;
     case 15:
-      if (is_first)     _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp);
-      else if (is_last) _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp);
-      else              _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp);
+      if (is_first)
+        _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last)
+        _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp);
+      else
+        _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp);
       break;
     default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size));
   }

From 14caf5ccbe4904ff08b9a1de367f8d7ef170f624 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Mon, 8 Jun 2026 15:19:29 -0700
Subject: [PATCH 7/7] Default to Debug build when no build type is specified

---
 CMakeLists.txt | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a5bc221..8e9fee8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,11 +5,9 @@ project(NAM VERSION 0.4.0)
 
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
-# Default to a Release build: without this, an empty CMAKE_BUILD_TYPE compiles
-# with no optimization flags at all, which is 10x+ slower for Eigen-heavy code.
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-	set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE)
-	message(STATUS "No build type specified; defaulting to Release")
+	set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE)
+	message(STATUS "No build type specified; defaulting to Debug")
 endif()
 
 set(CMAKE_CXX_STANDARD 20)