diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8f9fe79..8e9fee8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,11 @@ project(NAM VERSION 0.4.0)
 
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+	set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE)
+	message(STATUS "No build type specified; defaulting to Debug")
+endif()
+
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED OFF)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp
index 8fd8fc1..62d0206 100644
--- a/NAM/wavenet/a2_fast.cpp
+++ b/NAM/wavenet/a2_fast.cpp
@@ -91,7 +91,8 @@ class A2FastModel : public DSP
     std::array<float, Channels> l1x1_b{};
 
     // Conv1D input history ring buffer, column-major (Channels rows).
-    std::vector<float> history;
+    // Points into A2FastModel::_history_arena; do not free.
+    float* history = nullptr;
   #if NAM_A2_RING_MODE == 1
     // pow2 ring + tail mirror. Storage = (pow2_size + max_buffer_size) cols.
     // write_pos is kept in [0, pow2_size), reads use (pos & pow2_mask) and are
@@ -133,9 +134,14 @@ class A2FastModel : public DSP
   int _head_write_pos = 0;
   #endif
 
+  // Single contiguous allocation for all 23 layers' history buffers.
+  // Placing them in one arena rather than 23 separate vectors distributes
+  // their base addresses across cache sets, reducing L1 set-conflict misses
+  // on narrow caches (e.g. Cortex-A8 with 16KB / 4-way = 4KB set stride).
+  std::vector<float> _history_arena;
+
   // Working buffers (all Channels rows, max_buffer_size cols, col-major).
   std::vector<float> _layer_in; // current layer input / next layer input (in-place residual)
-  std::vector<float> _head_sum; // accumulates activations across all layers
   std::vector<float> _z; // per-layer conv output accumulator (tap-major)
   std::vector<float> _cond; // float32 copy of the double NAM_SAMPLE input, reused each block
   std::vector<float> _head_out; // float32 head output before writing to NAM_SAMPLE
@@ -144,15 +150,19 @@ class A2FastModel : public DSP
 
   void _load_weights(std::vector<float>& weights);
   void _ring_write(Layer& L, int num_frames);
-  void _head_ring_write(int num_frames);
-  void _layer_forward(int layer_idx, const float* cond, int num_frames);
+  void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, int head_wp);
   void _head_forward(float* output, int num_frames);
 
   // Compile-time-specialized per-layer kernel. KernelSize is lifted to a
   // template parameter so clang can fully unroll the tap loop and schedule
   // FMAs across taps. For the A2 shape we only need K=6 and K=15.
-  template <int KernelSize>
-  void _layer_forward_k(Layer& L, const float* cond, int num_frames);
+  // IsFirst: true for layer 0 only. Selects = vs += when writing the head
+  // accumulator directly into _head_history, avoiding a separate zeroing pass.
+  // IsLast: true for the final layer only. Its layer1x1 residual output feeds
+  // no downstream layer (the model output flows through _head_history ->
+  // _head_forward), so the 1x1 projection + residual store are dead and elided.
+  template <int KernelSize, bool IsFirst, bool IsLast>
+  void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp);
 };
 
 // -----------------------------------------------------------------------------
@@ -293,23 +303,33 @@ void A2FastModel<Channels>::SetMaxBufferSize(int maxBufferSize)
   DSP::SetMaxBufferSize(maxBufferSize);
 
   _layer_in.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
-  _head_sum.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
   _z.assign(static_cast<size_t>(Channels) * maxBufferSize, 0.0f);
   _cond.assign(static_cast<size_t>(maxBufferSize), 0.0f);
   _head_out.assign(static_cast<size_t>(maxBufferSize), 0.0f);
 
-  for (auto& L : _layers)
+  // Compute per-layer slot sizes, then allocate one contiguous arena.
+  std::array<size_t, kNumLayers> slot_sizes{};
+  size_t arena_total = 0;
+  for (int i = 0; i < kNumLayers; i++)
   {
+    Layer& L = _layers[i];
   #if NAM_A2_RING_MODE == 1
     L.pow2_size = next_pow2(L.max_lookback + maxBufferSize);
     L.pow2_mask = L.pow2_size - 1;
-    L.history.assign(static_cast<size_t>(Channels) * (L.pow2_size + maxBufferSize), 0.0f);
-    L.write_pos = L.max_lookback;
+    slot_sizes[i] = static_cast<size_t>(Channels) * (L.pow2_size + maxBufferSize);
   #else
     L.history_cols = 2 * L.max_lookback + maxBufferSize;
-    L.history.assign(static_cast<size_t>(Channels) * L.history_cols, 0.0f);
-    L.write_pos = L.max_lookback;
+    slot_sizes[i] = static_cast<size_t>(Channels) * L.history_cols;
   #endif
+    L.write_pos = L.max_lookback;
+    arena_total += slot_sizes[i];
+  }
+  _history_arena.assign(arena_total, 0.0f);
+  size_t arena_offset = 0;
+  for (int i = 0; i < kNumLayers; i++)
+  {
+    _layers[i].history = _history_arena.data() + arena_offset;
+    arena_offset += slot_sizes[i];
   }
 
   const int head_lookback = kHeadKernelSize - 1;
@@ -327,18 +347,19 @@ void A2FastModel<Channels>::SetMaxBufferSize(int maxBufferSize)
 
 // -----------------------------------------------------------------------------
 // Ring-write helpers.
-//   Mode 1: pow2 + tail mirror. Constant-time per block (one short memcpy
-//   into the ring, one mirror refresh).
-//   Mode 0: linear with periodic memmove rewind. When write_pos nears the
-//   end of history, memmove the trailing max_lookback cols back to offset 0
-//   and reset write_pos. That memmove is the jitter spike we're measuring.
+//   Mode 1: pow2 + tail mirror, mirror-on-demand. Reads in _layer_forward_k
+//   use `tap_base + f` without masking, so the mirror region at
+//   [pow2_size, pow2_size + mbs) must cover any reads that overflow past
+//   pow2_size. After each write, predict where the next call's tap reads will
+//   land and mirror only the columns that actually overflow — frequently zero.
+//   Avoids the full mbs*Channels*4 memcpy that a naive refresh would do.
+//   Mode 0: linear with periodic memmove rewind.
 // -----------------------------------------------------------------------------
 template <int Channels>
 void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
 {
   #if NAM_A2_RING_MODE == 1
-  const int mbs = GetMaxBufferSize();
-  float* const hist = L.history.data();
+  float* const hist = L.history;
   const float* const src = _layer_in.data();
   const int wp = L.write_pos;
   const int first = std::min(num_frames, L.pow2_size - wp);
@@ -348,67 +369,52 @@ void A2FastModel<Channels>::_ring_write(Layer& L, int num_frames)
     std::memcpy(hist, src + static_cast<size_t>(first) * Channels,
                 static_cast<size_t>(num_frames - first) * Channels * sizeof(float));
   }
-  std::memcpy(
-    hist + static_cast<size_t>(L.pow2_size) * Channels, hist, static_cast<size_t>(mbs) * Channels * sizeof(float));
-  L.write_pos = (wp + num_frames) & L.pow2_mask;
+  const int new_wp = (wp + num_frames) & L.pow2_mask;
+
+  // Mirror-on-demand: for each tap, predict the read range in the next call
+  // and mirror only the columns that would overflow past pow2_size.
+  int mirror_needed = 0;
+  const int K = L.kernel_size;
+  const int D = L.dilation;
+  for (int k = 0; k < K; k++)
+  {
+    const int taps_back = K - 1 - k;
+    const int tap_base = (new_wp - num_frames - taps_back * D) & L.pow2_mask;
+    const int read_end = tap_base + num_frames - 1;
+    if (read_end >= L.pow2_size)
+      mirror_needed = std::max(mirror_needed, read_end - L.pow2_size + 1);
+  }
+  if (mirror_needed > 0)
+    std::memcpy(hist + static_cast<size_t>(L.pow2_size) * Channels, hist,
+                static_cast<size_t>(mirror_needed) * Channels * sizeof(float));
+
+  L.write_pos = new_wp;
   #else
   if (L.write_pos + num_frames > L.history_cols)
   {
     const int keep = L.max_lookback;
-    std::memmove(L.history.data(), L.history.data() + static_cast<size_t>(L.write_pos - keep) * Channels,
+    std::memmove(L.history, L.history + static_cast<size_t>(L.write_pos - keep) * Channels,
                  static_cast<size_t>(keep) * Channels * sizeof(float));
     L.write_pos = keep;
   }
-  std::memcpy(L.history.data() + static_cast<size_t>(L.write_pos) * Channels, _layer_in.data(),
+  std::memcpy(L.history + static_cast<size_t>(L.write_pos) * Channels, _layer_in.data(),
               static_cast<size_t>(num_frames) * Channels * sizeof(float));
   L.write_pos += num_frames;
   #endif
 }
 
-template <int Channels>
-void A2FastModel<Channels>::_head_ring_write(int num_frames)
-{
-  #if NAM_A2_RING_MODE == 1
-  const int mbs = GetMaxBufferSize();
-  float* const hist = _head_history.data();
-  const float* const src = _head_sum.data();
-  const int wp = _head_write_pos;
-  const int first = std::min(num_frames, _head_pow2_size - wp);
-  std::memcpy(hist + static_cast<size_t>(wp) * Channels, src, static_cast<size_t>(first) * Channels * sizeof(float));
-  if (first < num_frames)
-  {
-    std::memcpy(hist, src + static_cast<size_t>(first) * Channels,
-                static_cast<size_t>(num_frames - first) * Channels * sizeof(float));
-  }
-  std::memcpy(
-    hist + static_cast<size_t>(_head_pow2_size) * Channels, hist, static_cast<size_t>(mbs) * Channels * sizeof(float));
-  _head_write_pos = (wp + num_frames) & _head_pow2_mask;
-  #else
-  const int keep = kHeadKernelSize - 1;
-  if (_head_write_pos + num_frames > _head_history_cols)
-  {
-    std::memmove(_head_history.data(), _head_history.data() + static_cast<size_t>(_head_write_pos - keep) * Channels,
-                 static_cast<size_t>(keep) * Channels * sizeof(float));
-    _head_write_pos = keep;
-  }
-  std::memcpy(_head_history.data() + static_cast<size_t>(_head_write_pos) * Channels, _head_sum.data(),
-              static_cast<size_t>(num_frames) * Channels * sizeof(float));
-  _head_write_pos += num_frames;
-  #endif
-}
-
 // -----------------------------------------------------------------------------
 // Per-layer forward pass. Reads current _layer_in, writes back into _layer_in
 // after applying dilated conv + mixin + LeakyReLU + layer1x1 residual, and
-// accumulates activations into _head_sum.
+// writes/accumulates activations directly into _head_history at head_wp.
 // -----------------------------------------------------------------------------
 // Compile-time-specialized per-layer kernel. KernelSize is a template param
 // so the K tap loop + per-tap weight offsets become compile-time constants;
 // clang fully unrolls and can schedule FMAs across taps. Called from the
 // runtime dispatcher below for each A2 kernel size (6 and 15).
 template <int Channels>
-template <int KernelSize>
-void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int num_frames)
+template <int KernelSize, bool IsFirst, bool IsLast>
+void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp)
 {
   constexpr int K = KernelSize;
   const int D = L.dilation;
@@ -539,16 +545,28 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
       a0 = (a0 >= 0.0f) ? a0 : a0 * kLeakySlope;
       a1 = (a1 >= 0.0f) ? a1 : a1 * kLeakySlope;
       a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope;
-      // Head sum accumulate.
-      float* hsum = &_head_sum[static_cast<size_t>(f) * 3];
-      hsum[0] += a0;
-      hsum[1] += a1;
-      hsum[2] += a2;
-      // layer1x1 residual.
-      float* lin = &_layer_in[static_cast<size_t>(f) * 3];
-      lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
-      lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2;
-      lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2;
+      // Write/accumulate directly into _head_history at the current ring slot.
+      float* hslot = &_head_history[static_cast<size_t>(head_wp + f) * 3];
+      if constexpr (IsFirst)
+      {
+        hslot[0] = a0;
+        hslot[1] = a1;
+        hslot[2] = a2;
+      }
+      else
+      {
+        hslot[0] += a0;
+        hslot[1] += a1;
+        hslot[2] += a2;
+      }
+      // layer1x1 residual (elided on the final layer: its output is dead).
+      if constexpr (!IsLast)
+      {
+        float* lin = &_layer_in[static_cast<size_t>(f) * 3];
+        lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2;
+        lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2;
+        lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2;
+      }
     }
   }
   else
@@ -578,27 +596,88 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
     Eigen::Map<const RowDyn> cond_row(cond, 1, num_frames);
 
     Eigen::Map<MatCDyn> ztile(_z.data(), Channels, num_frames);
-    Eigen::Map<MatCDyn> hsum_block(_head_sum.data(), Channels, num_frames);
+    Eigen::Map<MatCDyn> hslot_block(&_head_history[static_cast<size_t>(head_wp) * Channels], Channels, num_frames);
     Eigen::Map<MatCDyn> lin_block(_layer_in.data(), Channels, num_frames);
 
-    ztile.setZero();
-
-    // Conv: one 8x8 × 8xN GEMM per tap.
-    for (int k = 0; k < K; k++)
+    // Conv: T=4 frame-tiled, tap-major.
+    //
+    // For each tile of T=4 frames, accumulate all K taps and all Channels input
+    // channels into T*Channels output accumulators kept in local storage. The
+    // inner loop body is:
+    //
+    //   a[f][o] += Wcol[o] * h[f]   (o vectorized, h[f] scalar broadcast)
+    //
+    // where Wcol = W[:,cp] is stride-1 in o and h[f] = history[col+f][cp] is a
+    // scalar. On AArch64 the compiler emits vmlaq_n_f32 (SIMD FMA with scalar
+    // broadcast) with weight vector Wcol reused across all T frames — the same
+    // weight amortisation as an explicit NEON microkernel, without intrinsics.
+    // Equivalent SIMD broadcast-FMA instructions are emitted on x86 (AVX) and
+    // other SIMD targets automatically.
     {
-      const int tap_base = tap_base_phys(K - 1 - k);
-      Eigen::Map<const MatCC> W(&L.conv_w[static_cast<size_t>(k) * Channels * Channels]);
-      Eigen::Map<const MatCDyn> input_block(&L.history[static_cast<size_t>(tap_base) * Channels], Channels, num_frames);
-      ztile.noalias() += W * input_block;
+      float* z = _z.data();
+      constexpr int T = 4;
+      const int nF4 = (num_frames / T) * T;
+
+      for (int f = 0; f < nF4; f += T)
+      {
+        float a[T][Channels]{}; // zero-init; register-allocated by compiler
+
+        for (int k = 0; k < K; k++)
+        {
+          const float* W = &L.conv_w[static_cast<size_t>(k) * Channels * Channels];
+          const float* hb = L.history + static_cast<size_t>(tap_base_phys(K - 1 - k) + f) * Channels;
+          for (int cp = 0; cp < Channels; cp++)
+          {
+            const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o
+            const float h0 = hb[cp], h1 = hb[Channels + cp], h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp];
+            for (int o = 0; o < Channels; o++)
+            {
+              a[0][o] += Wcol[o] * h0;
+              a[1][o] += Wcol[o] * h1;
+              a[2][o] += Wcol[o] * h2;
+              a[3][o] += Wcol[o] * h3;
+            }
+          }
+        }
+        for (int ti = 0; ti < T; ti++)
+          std::memcpy(z + static_cast<size_t>(f + ti) * Channels, a[ti], Channels * sizeof(float));
+      }
+
+      // Scalar tail for any frames past the T-aligned boundary.
+      for (int f = nF4; f < num_frames; f++)
+      {
+        float* zf = z + static_cast<size_t>(f) * Channels;
+        for (int o = 0; o < Channels; o++)
+          zf[o] = 0.0f;
+        for (int k = 0; k < K; k++)
+        {
+          const float* W = &L.conv_w[static_cast<size_t>(k) * Channels * Channels];
+          const float* h = L.history + static_cast<size_t>(tap_base_phys(K - 1 - k) + f) * Channels;
+          for (int cp = 0; cp < Channels; cp++)
+          {
+            const float hv = h[cp];
+            const float* Wcol = W + cp * Channels;
+            for (int o = 0; o < Channels; o++)
+              zf[o] += Wcol[o] * hv;
+          }
+        }
+      }
     }
 
-    // Post-conv: bias, mixin, LeakyReLU, head_sum, 1x1 residual — all block ops.
+    // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual.
     ztile.colwise() += conv_b_vec;
     ztile.noalias() += mixin_vec * cond_row; // rank-1 outer product
     ztile = (ztile.array() < 0.0f).select(ztile.array() * kLeakySlope, ztile.array());
-    hsum_block += ztile;
-    lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
-    lin_block.colwise() += l1x1_b_vec;
+    if constexpr (IsFirst)
+      hslot_block = ztile;
+    else
+      hslot_block += ztile;
+    // layer1x1 residual (elided on the final layer: its output is dead).
+    if constexpr (!IsLast)
+    {
+      lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM
+      lin_block.colwise() += l1x1_b_vec;
+    }
   }
 }
 
@@ -606,14 +685,30 @@ void A2FastModel<Channels>::_layer_forward_k(Layer& L, const float* cond, int nu
 // For the A2 shape the detector only admits K in {6, 15}; any other value
 // here means something passed the detector that shouldn't have.
 template <int Channels>
-void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int num_frames)
+void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first,
+                                           bool is_last, int head_wp)
 {
   Layer& L = _layers[layer_idx];
   _ring_write(L, num_frames);
+  // is_first and is_last are mutually exclusive (>1 layer), so three combos.
   switch (L.kernel_size)
   {
-    case 6: _layer_forward_k<6>(L, cond, num_frames); break;
-    case 15: _layer_forward_k<15>(L, cond, num_frames); break;
+    case 6:
+      if (is_first)
+        _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last)
+        _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp);
+      else
+        _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp);
+      break;
+    case 15:
+      if (is_first)
+        _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp);
+      else if (is_last)
+        _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp);
+      else
+        _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp);
+      break;
     default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size));
   }
 }
@@ -621,10 +716,11 @@ void A2FastModel<Channels>::_layer_forward(int layer_idx, const float* cond, int
 // -----------------------------------------------------------------------------
 // Head: K=16 dilation-1 conv from Channels to 1, plus bias + scale.
 // -----------------------------------------------------------------------------
+// process() writes all 23 layers' activations directly into _head_history
+// before calling this function, and has already advanced _head_write_pos.
 template <int Channels>
 void A2FastModel<Channels>::_head_forward(float* output, int num_frames)
 {
-  _head_ring_write(num_frames);
   #if NAM_A2_RING_MODE == 1
   const int mask = _head_pow2_mask;
   auto col_of = [&](int f, int k) { return (_head_write_pos - num_frames + f - (kHeadKernelSize - 1 - k)) & mask; };
@@ -672,11 +768,35 @@ void A2FastModel<Channels>::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int
       lin[c] = _rechannel_w[c] * x;
   }
 
-  // Zero head accumulator.
-  std::memset(_head_sum.data(), 0, static_cast<size_t>(num_frames) * Channels * sizeof(float));
+  // Advance the head ring write position. Rewind if writing num_frames more
+  // would overflow the contiguous range, preserving the K-1 lookback window.
+  const int head_keep = kHeadKernelSize - 1;
+  #if NAM_A2_RING_MODE == 1
+  const int head_cap = _head_pow2_size;
+  #else
+  const int head_cap = _head_history_cols;
+  #endif
+  if (_head_write_pos + num_frames > head_cap)
+  {
+    std::memmove(_head_history.data(),
+                 _head_history.data() + static_cast<size_t>(_head_write_pos - head_keep) * Channels,
+                 static_cast<size_t>(head_keep) * Channels * sizeof(float));
+    _head_write_pos = head_keep;
+  }
+  const int head_wp = _head_write_pos;
+
+  // Layer forward: layer 0 assigns into _head_history (IsFirst=true),
+  // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed.
+  _layer_forward(0, cond, num_frames, /*is_first=*/true, /*is_last=*/false, head_wp);
+  for (int li = 1; li < kNumLayers; li++)
+    _layer_forward(li, cond, num_frames, /*is_first=*/false, /*is_last=*/li == kNumLayers - 1, head_wp);
 
-  for (int li = 0; li < kNumLayers; li++)
-    _layer_forward(li, cond, num_frames);
+  // Advance the write position past this buffer's worth of frames.
+  #if NAM_A2_RING_MODE == 1
+  _head_write_pos = (head_wp + num_frames) & _head_pow2_mask;
+  #else
+  _head_write_pos = head_wp + num_frames;
+  #endif
 
   // Output.
   float* head_out = _head_out.data();