From e9956ac9272551cccd35b9ddc863f5941843ab53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 4 Jun 2026 13:27:45 -0700 Subject: [PATCH 1/7] A2FastModel: allocate layer history buffers from a single arena Previously each of the 23 layers held its own std::vector history. Heap allocators can place these at addresses that share the same L1 cache sets, causing conflict misses on the wide-dilation layers whose tap reads, _layer_in writes, and head-history writes simultaneously compete for cache. Replace with a single contiguous allocation (_history_arena) from which each layer takes a raw pointer slice. Sequential placement naturally distributes base addresses across cache sets because each slot size mod the set stride is non-zero. Co-Authored-By: Claude Sonnet 4.6 Co-Authored-By: Landon McCoy --- NAM/wavenet/a2_fast.cpp | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 8fd8fc1..016d990 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -91,7 +91,8 @@ class A2FastModel : public DSP std::array l1x1_b{}; // Conv1D input history ring buffer, column-major (Channels rows). - std::vector history; + // Points into A2FastModel::_history_arena; do not free. + float* history = nullptr; #if NAM_A2_RING_MODE == 1 // pow2 ring + tail mirror. Storage = (pow2_size + max_buffer_size) cols. // write_pos is kept in [0, pow2_size), reads use (pos & pow2_mask) and are @@ -133,6 +134,12 @@ class A2FastModel : public DSP int _head_write_pos = 0; #endif + // Single contiguous allocation for all 23 layers' history buffers. + // Placing them in one arena rather than 23 separate vectors distributes + // their base addresses across cache sets, reducing L1 set-conflict misses + // on narrow caches (e.g. Cortex-A8 with 16KB / 4-way = 4KB set stride). + std::vector _history_arena; + // Working buffers (all Channels rows, max_buffer_size cols, col-major). std::vector _layer_in; // current layer input / next layer input (in-place residual) std::vector _head_sum; // accumulates activations across all layers @@ -298,18 +305,29 @@ void A2FastModel::SetMaxBufferSize(int maxBufferSize) _cond.assign(static_cast(maxBufferSize), 0.0f); _head_out.assign(static_cast(maxBufferSize), 0.0f); - for (auto& L : _layers) + // Compute per-layer slot sizes, then allocate one contiguous arena. + std::array slot_sizes{}; + size_t arena_total = 0; + for (int i = 0; i < kNumLayers; i++) { + Layer& L = _layers[i]; #if NAM_A2_RING_MODE == 1 L.pow2_size = next_pow2(L.max_lookback + maxBufferSize); L.pow2_mask = L.pow2_size - 1; - L.history.assign(static_cast(Channels) * (L.pow2_size + maxBufferSize), 0.0f); - L.write_pos = L.max_lookback; + slot_sizes[i] = static_cast(Channels) * (L.pow2_size + maxBufferSize); #else L.history_cols = 2 * L.max_lookback + maxBufferSize; - L.history.assign(static_cast(Channels) * L.history_cols, 0.0f); - L.write_pos = L.max_lookback; + slot_sizes[i] = static_cast(Channels) * L.history_cols; #endif + L.write_pos = L.max_lookback; + arena_total += slot_sizes[i]; + } + _history_arena.assign(arena_total, 0.0f); + size_t arena_offset = 0; + for (int i = 0; i < kNumLayers; i++) + { + _layers[i].history = _history_arena.data() + arena_offset; + arena_offset += slot_sizes[i]; } const int head_lookback = kHeadKernelSize - 1; @@ -338,7 +356,7 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) { #if NAM_A2_RING_MODE == 1 const int mbs = GetMaxBufferSize(); - float* const hist = L.history.data(); + float* const hist = L.history; const float* const src = _layer_in.data(); const int wp = L.write_pos; const int first = std::min(num_frames, L.pow2_size - wp); @@ -355,11 +373,11 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) if (L.write_pos + num_frames > L.history_cols) { const int keep = L.max_lookback; - std::memmove(L.history.data(), L.history.data() + static_cast(L.write_pos - keep) * Channels, + std::memmove(L.history, L.history + static_cast(L.write_pos - keep) * Channels, static_cast(keep) * Channels * sizeof(float)); L.write_pos = keep; } - std::memcpy(L.history.data() + static_cast(L.write_pos) * Channels, _layer_in.data(), + std::memcpy(L.history + static_cast(L.write_pos) * Channels, _layer_in.data(), static_cast(num_frames) * Channels * sizeof(float)); L.write_pos += num_frames; #endif From 5961e7960cf0a52306bdf5a8e406213f07a52ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 4 Jun 2026 13:30:31 -0700 Subject: [PATCH 2/7] A2FastModel: write head activations directly to _head_history Previously each layer accumulated its post-activation output into a separate _head_sum buffer, which was then zeroed before every block (memset of Channels*num_frames floats) and copied into _head_history by _head_ring_write (another memcpy of the same size) after all layers finished. Replace with direct writes: layer 0 assigns (IsFirst=true template parameter), layers 1-22 accumulate (IsFirst=false). The IsFirst branch resolves at compile time, so there is no runtime branch in the hot loop. _head_ring_write is removed; process() now manages the head ring write position directly, rewinding with a cheap memmove of kHeadKernelSize-1 columns when the end of the buffer is reached. Also eliminates ztile.setZero() for the Channels=8 Eigen path: tap 0 now assigns into ztile (noalias() =) rather than accumulating on top of a zero-initialised matrix, saving one Channels*num_frames write per layer. Co-Authored-By: Claude Sonnet 4.6 Co-Authored-By: Landon McCoy --- NAM/wavenet/a2_fast.cpp | 131 +++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 61 deletions(-) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 016d990..ceb67bf 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -142,7 +142,6 @@ class A2FastModel : public DSP // Working buffers (all Channels rows, max_buffer_size cols, col-major). std::vector _layer_in; // current layer input / next layer input (in-place residual) - std::vector _head_sum; // accumulates activations across all layers std::vector _z; // per-layer conv output accumulator (tap-major) std::vector _cond; // float32 copy of the double NAM_SAMPLE input, reused each block std::vector _head_out; // float32 head output before writing to NAM_SAMPLE @@ -151,15 +150,16 @@ class A2FastModel : public DSP void _load_weights(std::vector& weights); void _ring_write(Layer& L, int num_frames); - void _head_ring_write(int num_frames); - void _layer_forward(int layer_idx, const float* cond, int num_frames); + void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, int head_wp); void _head_forward(float* output, int num_frames); // Compile-time-specialized per-layer kernel. KernelSize is lifted to a // template parameter so clang can fully unroll the tap loop and schedule // FMAs across taps. For the A2 shape we only need K=6 and K=15. - template - void _layer_forward_k(Layer& L, const float* cond, int num_frames); + // IsFirst: true for layer 0 only. Selects = vs += when writing the head + // accumulator directly into _head_history, avoiding a separate zeroing pass. + template + void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp); }; // ----------------------------------------------------------------------------- @@ -300,7 +300,6 @@ void A2FastModel::SetMaxBufferSize(int maxBufferSize) DSP::SetMaxBufferSize(maxBufferSize); _layer_in.assign(static_cast(Channels) * maxBufferSize, 0.0f); - _head_sum.assign(static_cast(Channels) * maxBufferSize, 0.0f); _z.assign(static_cast(Channels) * maxBufferSize, 0.0f); _cond.assign(static_cast(maxBufferSize), 0.0f); _head_out.assign(static_cast(maxBufferSize), 0.0f); @@ -383,50 +382,18 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) #endif } -template -void A2FastModel::_head_ring_write(int num_frames) -{ - #if NAM_A2_RING_MODE == 1 - const int mbs = GetMaxBufferSize(); - float* const hist = _head_history.data(); - const float* const src = _head_sum.data(); - const int wp = _head_write_pos; - const int first = std::min(num_frames, _head_pow2_size - wp); - std::memcpy(hist + static_cast(wp) * Channels, src, static_cast(first) * Channels * sizeof(float)); - if (first < num_frames) - { - std::memcpy(hist, src + static_cast(first) * Channels, - static_cast(num_frames - first) * Channels * sizeof(float)); - } - std::memcpy( - hist + static_cast(_head_pow2_size) * Channels, hist, static_cast(mbs) * Channels * sizeof(float)); - _head_write_pos = (wp + num_frames) & _head_pow2_mask; - #else - const int keep = kHeadKernelSize - 1; - if (_head_write_pos + num_frames > _head_history_cols) - { - std::memmove(_head_history.data(), _head_history.data() + static_cast(_head_write_pos - keep) * Channels, - static_cast(keep) * Channels * sizeof(float)); - _head_write_pos = keep; - } - std::memcpy(_head_history.data() + static_cast(_head_write_pos) * Channels, _head_sum.data(), - static_cast(num_frames) * Channels * sizeof(float)); - _head_write_pos += num_frames; - #endif -} - // ----------------------------------------------------------------------------- // Per-layer forward pass. Reads current _layer_in, writes back into _layer_in // after applying dilated conv + mixin + LeakyReLU + layer1x1 residual, and -// accumulates activations into _head_sum. +// writes/accumulates activations directly into _head_history at head_wp. // ----------------------------------------------------------------------------- // Compile-time-specialized per-layer kernel. KernelSize is a template param // so the K tap loop + per-tap weight offsets become compile-time constants; // clang fully unrolls and can schedule FMAs across taps. Called from the // runtime dispatcher below for each A2 kernel size (6 and 15). template -template -void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int num_frames) +template +void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp) { constexpr int K = KernelSize; const int D = L.dilation; @@ -557,11 +524,17 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu a0 = (a0 >= 0.0f) ? a0 : a0 * kLeakySlope; a1 = (a1 >= 0.0f) ? a1 : a1 * kLeakySlope; a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope; - // Head sum accumulate. - float* hsum = &_head_sum[static_cast(f) * 3]; - hsum[0] += a0; - hsum[1] += a1; - hsum[2] += a2; + // Write/accumulate directly into _head_history at the current ring slot. + float* hslot = &_head_history[static_cast(head_wp + f) * 3]; + if constexpr (IsFirst) { + hslot[0] = a0; + hslot[1] = a1; + hslot[2] = a2; + } else { + hslot[0] += a0; + hslot[1] += a1; + hslot[2] += a2; + } // layer1x1 residual. float* lin = &_layer_in[static_cast(f) * 3]; lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; @@ -596,25 +569,29 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu Eigen::Map cond_row(cond, 1, num_frames); Eigen::Map ztile(_z.data(), Channels, num_frames); - Eigen::Map hsum_block(_head_sum.data(), Channels, num_frames); + Eigen::Map hslot_block(&_head_history[static_cast(head_wp) * Channels], Channels, num_frames); Eigen::Map lin_block(_layer_in.data(), Channels, num_frames); - ztile.setZero(); - - // Conv: one 8x8 × 8xN GEMM per tap. + // Conv: one 8x8 × 8xN GEMM per tap. Tap 0 assigns into ztile (no setZero needed). for (int k = 0; k < K; k++) { const int tap_base = tap_base_phys(K - 1 - k); Eigen::Map W(&L.conv_w[static_cast(k) * Channels * Channels]); Eigen::Map input_block(&L.history[static_cast(tap_base) * Channels], Channels, num_frames); - ztile.noalias() += W * input_block; + if (k == 0) + ztile.noalias() = W * input_block; + else + ztile.noalias() += W * input_block; } - // Post-conv: bias, mixin, LeakyReLU, head_sum, 1x1 residual — all block ops. + // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual. ztile.colwise() += conv_b_vec; ztile.noalias() += mixin_vec * cond_row; // rank-1 outer product ztile = (ztile.array() < 0.0f).select(ztile.array() * kLeakySlope, ztile.array()); - hsum_block += ztile; + if constexpr (IsFirst) + hslot_block = ztile; + else + hslot_block += ztile; lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM lin_block.colwise() += l1x1_b_vec; } @@ -624,14 +601,21 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu // For the A2 shape the detector only admits K in {6, 15}; any other value // here means something passed the detector that shouldn't have. template -void A2FastModel::_layer_forward(int layer_idx, const float* cond, int num_frames) +void A2FastModel::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, + int head_wp) { Layer& L = _layers[layer_idx]; _ring_write(L, num_frames); switch (L.kernel_size) { - case 6: _layer_forward_k<6>(L, cond, num_frames); break; - case 15: _layer_forward_k<15>(L, cond, num_frames); break; + case 6: + if (is_first) _layer_forward_k<6, true>(L, cond, num_frames, head_wp); + else _layer_forward_k<6, false>(L, cond, num_frames, head_wp); + break; + case 15: + if (is_first) _layer_forward_k<15, true>(L, cond, num_frames, head_wp); + else _layer_forward_k<15, false>(L, cond, num_frames, head_wp); + break; default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size)); } } @@ -639,10 +623,11 @@ void A2FastModel::_layer_forward(int layer_idx, const float* cond, int // ----------------------------------------------------------------------------- // Head: K=16 dilation-1 conv from Channels to 1, plus bias + scale. // ----------------------------------------------------------------------------- +// process() writes all 23 layers' activations directly into _head_history +// before calling this function, and has already advanced _head_write_pos. template void A2FastModel::_head_forward(float* output, int num_frames) { - _head_ring_write(num_frames); #if NAM_A2_RING_MODE == 1 const int mask = _head_pow2_mask; auto col_of = [&](int f, int k) { return (_head_write_pos - num_frames + f - (kHeadKernelSize - 1 - k)) & mask; }; @@ -690,11 +675,35 @@ void A2FastModel::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int lin[c] = _rechannel_w[c] * x; } - // Zero head accumulator. - std::memset(_head_sum.data(), 0, static_cast(num_frames) * Channels * sizeof(float)); + // Advance the head ring write position. Rewind if writing num_frames more + // would overflow the contiguous range, preserving the K-1 lookback window. + const int head_keep = kHeadKernelSize - 1; + #if NAM_A2_RING_MODE == 1 + const int head_cap = _head_pow2_size; + #else + const int head_cap = _head_history_cols; + #endif + if (_head_write_pos + num_frames > head_cap) + { + std::memmove(_head_history.data(), + _head_history.data() + static_cast(_head_write_pos - head_keep) * Channels, + static_cast(head_keep) * Channels * sizeof(float)); + _head_write_pos = head_keep; + } + const int head_wp = _head_write_pos; - for (int li = 0; li < kNumLayers; li++) - _layer_forward(li, cond, num_frames); + // Layer forward: layer 0 assigns into _head_history (IsFirst=true), + // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed. + _layer_forward(0, cond, num_frames, /*is_first=*/true, head_wp); + for (int li = 1; li < kNumLayers; li++) + _layer_forward(li, cond, num_frames, /*is_first=*/false, head_wp); + + // Advance the write position past this buffer's worth of frames. + #if NAM_A2_RING_MODE == 1 + _head_write_pos = (head_wp + num_frames) & _head_pow2_mask; + #else + _head_write_pos = head_wp + num_frames; + #endif // Output. float* head_out = _head_out.data(); From a77967807a68a9fc6b6183689d9e3aee39754bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 4 Jun 2026 13:48:45 -0700 Subject: [PATCH 3/7] A2FastModel: mirror-on-demand for per-layer ring writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously _ring_write always refreshed the full tail mirror — a memcpy of maxBufferSize * Channels * sizeof(float) bytes per layer per block (4 KB for Ch=8 / 128 frames, 92 KB total across 23 layers per buffer). Most of the time no tap read crosses the pow2_size boundary, so the mirror copy was pure overhead. Replace with mirror-on-demand: after each write, compute each tap's next-call read range and mirror only the columns that actually overflow past pow2_size. Zero copy is emitted whenever no tap straddles the wrap. Co-Authored-By: Claude Sonnet 4.6 Co-Authored-By: Landon McCoy --- NAM/wavenet/a2_fast.cpp | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index ceb67bf..800a6c1 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -344,17 +344,18 @@ void A2FastModel::SetMaxBufferSize(int maxBufferSize) // ----------------------------------------------------------------------------- // Ring-write helpers. -// Mode 1: pow2 + tail mirror. Constant-time per block (one short memcpy -// into the ring, one mirror refresh). -// Mode 0: linear with periodic memmove rewind. When write_pos nears the -// end of history, memmove the trailing max_lookback cols back to offset 0 -// and reset write_pos. That memmove is the jitter spike we're measuring. +// Mode 1: pow2 + tail mirror, mirror-on-demand. Reads in _layer_forward_k +// use `tap_base + f` without masking, so the mirror region at +// [pow2_size, pow2_size + mbs) must cover any reads that overflow past +// pow2_size. After each write, predict where the next call's tap reads will +// land and mirror only the columns that actually overflow — frequently zero. +// Avoids the full mbs*Channels*4 memcpy that a naive refresh would do. +// Mode 0: linear with periodic memmove rewind. // ----------------------------------------------------------------------------- template void A2FastModel::_ring_write(Layer& L, int num_frames) { #if NAM_A2_RING_MODE == 1 - const int mbs = GetMaxBufferSize(); float* const hist = L.history; const float* const src = _layer_in.data(); const int wp = L.write_pos; @@ -365,9 +366,26 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) std::memcpy(hist, src + static_cast(first) * Channels, static_cast(num_frames - first) * Channels * sizeof(float)); } - std::memcpy( - hist + static_cast(L.pow2_size) * Channels, hist, static_cast(mbs) * Channels * sizeof(float)); - L.write_pos = (wp + num_frames) & L.pow2_mask; + const int new_wp = (wp + num_frames) & L.pow2_mask; + + // Mirror-on-demand: for each tap, predict the read range in the next call + // and mirror only the columns that would overflow past pow2_size. + int mirror_needed = 0; + const int K = L.kernel_size; + const int D = L.dilation; + for (int k = 0; k < K; k++) + { + const int taps_back = K - 1 - k; + const int tap_base = (new_wp - num_frames - taps_back * D) & L.pow2_mask; + const int read_end = tap_base + num_frames - 1; + if (read_end >= L.pow2_size) + mirror_needed = std::max(mirror_needed, read_end - L.pow2_size + 1); + } + if (mirror_needed > 0) + std::memcpy(hist + static_cast(L.pow2_size) * Channels, hist, + static_cast(mirror_needed) * Channels * sizeof(float)); + + L.write_pos = new_wp; #else if (L.write_pos + num_frames > L.history_cols) { From 6c2815df386917d49b2990cb5bc8d069f1e59148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 4 Jun 2026 13:51:50 -0700 Subject: [PATCH 4/7] A2FastModel: portable T=4 frame-tiled conv for Channels >= 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the Eigen noalias() GEMM loop with an explicit T=4 frame-tiled, tap-major C++ kernel. Structure of the inner body: a[f][o] += Wcol[o] * h[f] (o = output channel, f = frame in tile) Wcol = W[:,cp] is stride-1 in o, so the o loop vectorizes. h[f] is a scalar loaded from the (column-major) history buffer, so the compiler emits a broadcast-scalar FMA instruction — vmlaq_n_f32 on AArch64, the AVX equivalent on x86 — without requiring architecture-specific intrinsics. The T=4 tiling amortizes the two weight register loads (W[:,cp] = 8 floats = 2 Q-regs on AArch64) across four independent accumulator chains, matching the weight-reuse strategy of the explicit NEON microkernel. A scalar tail handles buffer sizes that are not multiples of 4; in practice audio buffer sizes (64, 128, 256) are always multiples of 4. Co-Authored-By: Claude Sonnet 4.6 Co-Authored-By: Landon McCoy --- NAM/wavenet/a2_fast.cpp | 70 +++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 800a6c1..11d7978 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -590,16 +590,68 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu Eigen::Map hslot_block(&_head_history[static_cast(head_wp) * Channels], Channels, num_frames); Eigen::Map lin_block(_layer_in.data(), Channels, num_frames); - // Conv: one 8x8 × 8xN GEMM per tap. Tap 0 assigns into ztile (no setZero needed). - for (int k = 0; k < K; k++) + // Conv: T=4 frame-tiled, tap-major. + // + // For each tile of T=4 frames, accumulate all K taps and all Channels input + // channels into T*Channels output accumulators kept in local storage. The + // inner loop body is: + // + // a[f][o] += Wcol[o] * h[f] (o vectorized, h[f] scalar broadcast) + // + // where Wcol = W[:,cp] is stride-1 in o and h[f] = history[col+f][cp] is a + // scalar. On AArch64 the compiler emits vmlaq_n_f32 (SIMD FMA with scalar + // broadcast) with weight vector Wcol reused across all T frames — the same + // weight amortisation as an explicit NEON microkernel, without intrinsics. + // Equivalent SIMD broadcast-FMA instructions are emitted on x86 (AVX) and + // other SIMD targets automatically. { - const int tap_base = tap_base_phys(K - 1 - k); - Eigen::Map W(&L.conv_w[static_cast(k) * Channels * Channels]); - Eigen::Map input_block(&L.history[static_cast(tap_base) * Channels], Channels, num_frames); - if (k == 0) - ztile.noalias() = W * input_block; - else - ztile.noalias() += W * input_block; + float* z = _z.data(); + constexpr int T = 4; + const int nF4 = (num_frames / T) * T; + + for (int f = 0; f < nF4; f += T) + { + float a[T][Channels]{}; // zero-init; register-allocated by compiler + + for (int k = 0; k < K; k++) + { + const float* W = &L.conv_w[static_cast(k) * Channels * Channels]; + const float* hb = L.history + static_cast(tap_base_phys(K - 1 - k) + f) * Channels; + for (int cp = 0; cp < Channels; cp++) + { + const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o + const float h0 = hb[cp], h1 = hb[Channels + cp], + h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp]; + for (int o = 0; o < Channels; o++) + { + a[0][o] += Wcol[o] * h0; + a[1][o] += Wcol[o] * h1; + a[2][o] += Wcol[o] * h2; + a[3][o] += Wcol[o] * h3; + } + } + } + for (int ti = 0; ti < T; ti++) + std::memcpy(z + static_cast(f + ti) * Channels, a[ti], Channels * sizeof(float)); + } + + // Scalar tail for any frames past the T-aligned boundary. + for (int f = nF4; f < num_frames; f++) + { + float* zf = z + static_cast(f) * Channels; + for (int o = 0; o < Channels; o++) zf[o] = 0.0f; + for (int k = 0; k < K; k++) + { + const float* W = &L.conv_w[static_cast(k) * Channels * Channels]; + const float* h = L.history + static_cast(tap_base_phys(K - 1 - k) + f) * Channels; + for (int cp = 0; cp < Channels; cp++) + { + const float hv = h[cp]; + const float* Wcol = W + cp * Channels; + for (int o = 0; o < Channels; o++) zf[o] += Wcol[o] * hv; + } + } + } } // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual. From 78f92a225e4ef7db89d6a6c8afc93e42d020e581 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Mon, 8 Jun 2026 10:45:46 -0700 Subject: [PATCH 5/7] Dead terminal-residual elimination optimization + update to CMakeLists.txt to compile to Release by default. --- CMakeLists.txt | 7 +++++++ NAM/wavenet/a2_fast.cpp | 46 ++++++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f9fe79..a5bc221 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,13 @@ project(NAM VERSION 0.4.0) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") +# Default to a Release build: without this, an empty CMAKE_BUILD_TYPE compiles +# with no optimization flags at all, which is 10x+ slower for Eigen-heavy code. +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE) + message(STATUS "No build type specified; defaulting to Release") +endif() + set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED OFF) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 11d7978..267b63d 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -150,7 +150,8 @@ class A2FastModel : public DSP void _load_weights(std::vector& weights); void _ring_write(Layer& L, int num_frames); - void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, int head_wp); + void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, + int head_wp); void _head_forward(float* output, int num_frames); // Compile-time-specialized per-layer kernel. KernelSize is lifted to a @@ -158,7 +159,10 @@ class A2FastModel : public DSP // FMAs across taps. For the A2 shape we only need K=6 and K=15. // IsFirst: true for layer 0 only. Selects = vs += when writing the head // accumulator directly into _head_history, avoiding a separate zeroing pass. - template + // IsLast: true for the final layer only. Its layer1x1 residual output feeds + // no downstream layer (the model output flows through _head_history -> + // _head_forward), so the 1x1 projection + residual store are dead and elided. + template void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp); }; @@ -410,7 +414,7 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) // clang fully unrolls and can schedule FMAs across taps. Called from the // runtime dispatcher below for each A2 kernel size (6 and 15). template -template +template void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp) { constexpr int K = KernelSize; @@ -553,11 +557,13 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu hslot[1] += a1; hslot[2] += a2; } - // layer1x1 residual. - float* lin = &_layer_in[static_cast(f) * 3]; - lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; - lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2; - lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2; + // layer1x1 residual (elided on the final layer: its output is dead). + if constexpr (!IsLast) { + float* lin = &_layer_in[static_cast(f) * 3]; + lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; + lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2; + lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2; + } } } else @@ -662,8 +668,11 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu hslot_block = ztile; else hslot_block += ztile; - lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM - lin_block.colwise() += l1x1_b_vec; + // layer1x1 residual (elided on the final layer: its output is dead). + if constexpr (!IsLast) { + lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM + lin_block.colwise() += l1x1_b_vec; + } } } @@ -672,19 +681,22 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu // here means something passed the detector that shouldn't have. template void A2FastModel::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, - int head_wp) + bool is_last, int head_wp) { Layer& L = _layers[layer_idx]; _ring_write(L, num_frames); + // is_first and is_last are mutually exclusive (>1 layer), so three combos. switch (L.kernel_size) { case 6: - if (is_first) _layer_forward_k<6, true>(L, cond, num_frames, head_wp); - else _layer_forward_k<6, false>(L, cond, num_frames, head_wp); + if (is_first) _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp); + else if (is_last) _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp); + else _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp); break; case 15: - if (is_first) _layer_forward_k<15, true>(L, cond, num_frames, head_wp); - else _layer_forward_k<15, false>(L, cond, num_frames, head_wp); + if (is_first) _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp); + else if (is_last) _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp); + else _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp); break; default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size)); } @@ -764,9 +776,9 @@ void A2FastModel::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int // Layer forward: layer 0 assigns into _head_history (IsFirst=true), // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed. - _layer_forward(0, cond, num_frames, /*is_first=*/true, head_wp); + _layer_forward(0, cond, num_frames, /*is_first=*/true, /*is_last=*/false, head_wp); for (int li = 1; li < kNumLayers; li++) - _layer_forward(li, cond, num_frames, /*is_first=*/false, head_wp); + _layer_forward(li, cond, num_frames, /*is_first=*/false, /*is_last=*/li == kNumLayers - 1, head_wp); // Advance the write position past this buffer's worth of frames. #if NAM_A2_RING_MODE == 1 From 02d22775ead98b85093891013dcc5138f4ddd01c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Mon, 8 Jun 2026 10:51:55 -0700 Subject: [PATCH 6/7] Reformat A2Fast optimizations to repo clang-format style Allman braces for the new if constexpr / dispatcher blocks, no single-line loops, and collapse the manually-aligned declarations. No behavioral change. Co-Authored-By: Claude Opus 4.8 (1M context) --- NAM/wavenet/a2_fast.cpp | 45 +++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 267b63d..62d0206 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -150,8 +150,7 @@ class A2FastModel : public DSP void _load_weights(std::vector& weights); void _ring_write(Layer& L, int num_frames); - void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, - int head_wp); + void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, int head_wp); void _head_forward(float* output, int num_frames); // Compile-time-specialized per-layer kernel. KernelSize is lifted to a @@ -548,17 +547,21 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope; // Write/accumulate directly into _head_history at the current ring slot. float* hslot = &_head_history[static_cast(head_wp + f) * 3]; - if constexpr (IsFirst) { + if constexpr (IsFirst) + { hslot[0] = a0; hslot[1] = a1; hslot[2] = a2; - } else { + } + else + { hslot[0] += a0; hslot[1] += a1; hslot[2] += a2; } // layer1x1 residual (elided on the final layer: its output is dead). - if constexpr (!IsLast) { + if constexpr (!IsLast) + { float* lin = &_layer_in[static_cast(f) * 3]; lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2; @@ -617,7 +620,7 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu for (int f = 0; f < nF4; f += T) { - float a[T][Channels]{}; // zero-init; register-allocated by compiler + float a[T][Channels]{}; // zero-init; register-allocated by compiler for (int k = 0; k < K; k++) { @@ -626,8 +629,7 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu for (int cp = 0; cp < Channels; cp++) { const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o - const float h0 = hb[cp], h1 = hb[Channels + cp], - h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp]; + const float h0 = hb[cp], h1 = hb[Channels + cp], h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp]; for (int o = 0; o < Channels; o++) { a[0][o] += Wcol[o] * h0; @@ -645,7 +647,8 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu for (int f = nF4; f < num_frames; f++) { float* zf = z + static_cast(f) * Channels; - for (int o = 0; o < Channels; o++) zf[o] = 0.0f; + for (int o = 0; o < Channels; o++) + zf[o] = 0.0f; for (int k = 0; k < K; k++) { const float* W = &L.conv_w[static_cast(k) * Channels * Channels]; @@ -654,7 +657,8 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu { const float hv = h[cp]; const float* Wcol = W + cp * Channels; - for (int o = 0; o < Channels; o++) zf[o] += Wcol[o] * hv; + for (int o = 0; o < Channels; o++) + zf[o] += Wcol[o] * hv; } } } @@ -669,7 +673,8 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu else hslot_block += ztile; // layer1x1 residual (elided on the final layer: its output is dead). - if constexpr (!IsLast) { + if constexpr (!IsLast) + { lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM lin_block.colwise() += l1x1_b_vec; } @@ -689,14 +694,20 @@ void A2FastModel::_layer_forward(int layer_idx, const float* cond, int switch (L.kernel_size) { case 6: - if (is_first) _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp); - else if (is_last) _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp); - else _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp); + if (is_first) + _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp); + else if (is_last) + _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp); + else + _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp); break; case 15: - if (is_first) _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp); - else if (is_last) _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp); - else _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp); + if (is_first) + _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp); + else if (is_last) + _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp); + else + _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp); break; default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size)); } From 14caf5ccbe4904ff08b9a1de367f8d7ef170f624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Mon, 8 Jun 2026 15:19:29 -0700 Subject: [PATCH 7/7] Default to Debug build when no build type is specified --- CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5bc221..8e9fee8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,11 +5,9 @@ project(NAM VERSION 0.4.0) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") -# Default to a Release build: without this, an empty CMAKE_BUILD_TYPE compiles -# with no optimization flags at all, which is 10x+ slower for Eigen-heavy code. if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE) - message(STATUS "No build type specified; defaulting to Release") + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE) + message(STATUS "No build type specified; defaulting to Debug") endif() set(CMAKE_CXX_STANDARD 20)