diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f9fe79..8e9fee8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,11 @@ project(NAM VERSION 0.4.0) set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type (Release, Debug, RelWithDebInfo, MinSizeRel)" FORCE) + message(STATUS "No build type specified; defaulting to Debug") +endif() + set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED OFF) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/NAM/wavenet/a2_fast.cpp b/NAM/wavenet/a2_fast.cpp index 8fd8fc1..62d0206 100644 --- a/NAM/wavenet/a2_fast.cpp +++ b/NAM/wavenet/a2_fast.cpp @@ -91,7 +91,8 @@ class A2FastModel : public DSP std::array l1x1_b{}; // Conv1D input history ring buffer, column-major (Channels rows). - std::vector history; + // Points into A2FastModel::_history_arena; do not free. + float* history = nullptr; #if NAM_A2_RING_MODE == 1 // pow2 ring + tail mirror. Storage = (pow2_size + max_buffer_size) cols. // write_pos is kept in [0, pow2_size), reads use (pos & pow2_mask) and are @@ -133,9 +134,14 @@ class A2FastModel : public DSP int _head_write_pos = 0; #endif + // Single contiguous allocation for all 23 layers' history buffers. + // Placing them in one arena rather than 23 separate vectors distributes + // their base addresses across cache sets, reducing L1 set-conflict misses + // on narrow caches (e.g. Cortex-A8 with 16KB / 4-way = 4KB set stride). + std::vector _history_arena; + // Working buffers (all Channels rows, max_buffer_size cols, col-major). std::vector _layer_in; // current layer input / next layer input (in-place residual) - std::vector _head_sum; // accumulates activations across all layers std::vector _z; // per-layer conv output accumulator (tap-major) std::vector _cond; // float32 copy of the double NAM_SAMPLE input, reused each block std::vector _head_out; // float32 head output before writing to NAM_SAMPLE @@ -144,15 +150,19 @@ class A2FastModel : public DSP void _load_weights(std::vector& weights); void _ring_write(Layer& L, int num_frames); - void _head_ring_write(int num_frames); - void _layer_forward(int layer_idx, const float* cond, int num_frames); + void _layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, bool is_last, int head_wp); void _head_forward(float* output, int num_frames); // Compile-time-specialized per-layer kernel. KernelSize is lifted to a // template parameter so clang can fully unroll the tap loop and schedule // FMAs across taps. For the A2 shape we only need K=6 and K=15. - template - void _layer_forward_k(Layer& L, const float* cond, int num_frames); + // IsFirst: true for layer 0 only. Selects = vs += when writing the head + // accumulator directly into _head_history, avoiding a separate zeroing pass. + // IsLast: true for the final layer only. Its layer1x1 residual output feeds + // no downstream layer (the model output flows through _head_history -> + // _head_forward), so the 1x1 projection + residual store are dead and elided. + template + void _layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp); }; // ----------------------------------------------------------------------------- @@ -293,23 +303,33 @@ void A2FastModel::SetMaxBufferSize(int maxBufferSize) DSP::SetMaxBufferSize(maxBufferSize); _layer_in.assign(static_cast(Channels) * maxBufferSize, 0.0f); - _head_sum.assign(static_cast(Channels) * maxBufferSize, 0.0f); _z.assign(static_cast(Channels) * maxBufferSize, 0.0f); _cond.assign(static_cast(maxBufferSize), 0.0f); _head_out.assign(static_cast(maxBufferSize), 0.0f); - for (auto& L : _layers) + // Compute per-layer slot sizes, then allocate one contiguous arena. + std::array slot_sizes{}; + size_t arena_total = 0; + for (int i = 0; i < kNumLayers; i++) { + Layer& L = _layers[i]; #if NAM_A2_RING_MODE == 1 L.pow2_size = next_pow2(L.max_lookback + maxBufferSize); L.pow2_mask = L.pow2_size - 1; - L.history.assign(static_cast(Channels) * (L.pow2_size + maxBufferSize), 0.0f); - L.write_pos = L.max_lookback; + slot_sizes[i] = static_cast(Channels) * (L.pow2_size + maxBufferSize); #else L.history_cols = 2 * L.max_lookback + maxBufferSize; - L.history.assign(static_cast(Channels) * L.history_cols, 0.0f); - L.write_pos = L.max_lookback; + slot_sizes[i] = static_cast(Channels) * L.history_cols; #endif + L.write_pos = L.max_lookback; + arena_total += slot_sizes[i]; + } + _history_arena.assign(arena_total, 0.0f); + size_t arena_offset = 0; + for (int i = 0; i < kNumLayers; i++) + { + _layers[i].history = _history_arena.data() + arena_offset; + arena_offset += slot_sizes[i]; } const int head_lookback = kHeadKernelSize - 1; @@ -327,18 +347,19 @@ void A2FastModel::SetMaxBufferSize(int maxBufferSize) // ----------------------------------------------------------------------------- // Ring-write helpers. -// Mode 1: pow2 + tail mirror. Constant-time per block (one short memcpy -// into the ring, one mirror refresh). -// Mode 0: linear with periodic memmove rewind. When write_pos nears the -// end of history, memmove the trailing max_lookback cols back to offset 0 -// and reset write_pos. That memmove is the jitter spike we're measuring. +// Mode 1: pow2 + tail mirror, mirror-on-demand. Reads in _layer_forward_k +// use `tap_base + f` without masking, so the mirror region at +// [pow2_size, pow2_size + mbs) must cover any reads that overflow past +// pow2_size. After each write, predict where the next call's tap reads will +// land and mirror only the columns that actually overflow — frequently zero. +// Avoids the full mbs*Channels*4 memcpy that a naive refresh would do. +// Mode 0: linear with periodic memmove rewind. // ----------------------------------------------------------------------------- template void A2FastModel::_ring_write(Layer& L, int num_frames) { #if NAM_A2_RING_MODE == 1 - const int mbs = GetMaxBufferSize(); - float* const hist = L.history.data(); + float* const hist = L.history; const float* const src = _layer_in.data(); const int wp = L.write_pos; const int first = std::min(num_frames, L.pow2_size - wp); @@ -348,67 +369,52 @@ void A2FastModel::_ring_write(Layer& L, int num_frames) std::memcpy(hist, src + static_cast(first) * Channels, static_cast(num_frames - first) * Channels * sizeof(float)); } - std::memcpy( - hist + static_cast(L.pow2_size) * Channels, hist, static_cast(mbs) * Channels * sizeof(float)); - L.write_pos = (wp + num_frames) & L.pow2_mask; + const int new_wp = (wp + num_frames) & L.pow2_mask; + + // Mirror-on-demand: for each tap, predict the read range in the next call + // and mirror only the columns that would overflow past pow2_size. + int mirror_needed = 0; + const int K = L.kernel_size; + const int D = L.dilation; + for (int k = 0; k < K; k++) + { + const int taps_back = K - 1 - k; + const int tap_base = (new_wp - num_frames - taps_back * D) & L.pow2_mask; + const int read_end = tap_base + num_frames - 1; + if (read_end >= L.pow2_size) + mirror_needed = std::max(mirror_needed, read_end - L.pow2_size + 1); + } + if (mirror_needed > 0) + std::memcpy(hist + static_cast(L.pow2_size) * Channels, hist, + static_cast(mirror_needed) * Channels * sizeof(float)); + + L.write_pos = new_wp; #else if (L.write_pos + num_frames > L.history_cols) { const int keep = L.max_lookback; - std::memmove(L.history.data(), L.history.data() + static_cast(L.write_pos - keep) * Channels, + std::memmove(L.history, L.history + static_cast(L.write_pos - keep) * Channels, static_cast(keep) * Channels * sizeof(float)); L.write_pos = keep; } - std::memcpy(L.history.data() + static_cast(L.write_pos) * Channels, _layer_in.data(), + std::memcpy(L.history + static_cast(L.write_pos) * Channels, _layer_in.data(), static_cast(num_frames) * Channels * sizeof(float)); L.write_pos += num_frames; #endif } -template -void A2FastModel::_head_ring_write(int num_frames) -{ - #if NAM_A2_RING_MODE == 1 - const int mbs = GetMaxBufferSize(); - float* const hist = _head_history.data(); - const float* const src = _head_sum.data(); - const int wp = _head_write_pos; - const int first = std::min(num_frames, _head_pow2_size - wp); - std::memcpy(hist + static_cast(wp) * Channels, src, static_cast(first) * Channels * sizeof(float)); - if (first < num_frames) - { - std::memcpy(hist, src + static_cast(first) * Channels, - static_cast(num_frames - first) * Channels * sizeof(float)); - } - std::memcpy( - hist + static_cast(_head_pow2_size) * Channels, hist, static_cast(mbs) * Channels * sizeof(float)); - _head_write_pos = (wp + num_frames) & _head_pow2_mask; - #else - const int keep = kHeadKernelSize - 1; - if (_head_write_pos + num_frames > _head_history_cols) - { - std::memmove(_head_history.data(), _head_history.data() + static_cast(_head_write_pos - keep) * Channels, - static_cast(keep) * Channels * sizeof(float)); - _head_write_pos = keep; - } - std::memcpy(_head_history.data() + static_cast(_head_write_pos) * Channels, _head_sum.data(), - static_cast(num_frames) * Channels * sizeof(float)); - _head_write_pos += num_frames; - #endif -} - // ----------------------------------------------------------------------------- // Per-layer forward pass. Reads current _layer_in, writes back into _layer_in // after applying dilated conv + mixin + LeakyReLU + layer1x1 residual, and -// accumulates activations into _head_sum. +// writes/accumulates activations directly into _head_history at head_wp. // ----------------------------------------------------------------------------- // Compile-time-specialized per-layer kernel. KernelSize is a template param // so the K tap loop + per-tap weight offsets become compile-time constants; // clang fully unrolls and can schedule FMAs across taps. Called from the // runtime dispatcher below for each A2 kernel size (6 and 15). template -template -void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int num_frames) +template +void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int num_frames, int head_wp) { constexpr int K = KernelSize; const int D = L.dilation; @@ -539,16 +545,28 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu a0 = (a0 >= 0.0f) ? a0 : a0 * kLeakySlope; a1 = (a1 >= 0.0f) ? a1 : a1 * kLeakySlope; a2 = (a2 >= 0.0f) ? a2 : a2 * kLeakySlope; - // Head sum accumulate. - float* hsum = &_head_sum[static_cast(f) * 3]; - hsum[0] += a0; - hsum[1] += a1; - hsum[2] += a2; - // layer1x1 residual. - float* lin = &_layer_in[static_cast(f) * 3]; - lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; - lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2; - lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2; + // Write/accumulate directly into _head_history at the current ring slot. + float* hslot = &_head_history[static_cast(head_wp + f) * 3]; + if constexpr (IsFirst) + { + hslot[0] = a0; + hslot[1] = a1; + hslot[2] = a2; + } + else + { + hslot[0] += a0; + hslot[1] += a1; + hslot[2] += a2; + } + // layer1x1 residual (elided on the final layer: its output is dead). + if constexpr (!IsLast) + { + float* lin = &_layer_in[static_cast(f) * 3]; + lin[0] += lb0 + lw00 * a0 + lw10 * a1 + lw20 * a2; + lin[1] += lb1 + lw01 * a0 + lw11 * a1 + lw21 * a2; + lin[2] += lb2 + lw02 * a0 + lw12 * a1 + lw22 * a2; + } } } else @@ -578,27 +596,88 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu Eigen::Map cond_row(cond, 1, num_frames); Eigen::Map ztile(_z.data(), Channels, num_frames); - Eigen::Map hsum_block(_head_sum.data(), Channels, num_frames); + Eigen::Map hslot_block(&_head_history[static_cast(head_wp) * Channels], Channels, num_frames); Eigen::Map lin_block(_layer_in.data(), Channels, num_frames); - ztile.setZero(); - - // Conv: one 8x8 × 8xN GEMM per tap. - for (int k = 0; k < K; k++) + // Conv: T=4 frame-tiled, tap-major. + // + // For each tile of T=4 frames, accumulate all K taps and all Channels input + // channels into T*Channels output accumulators kept in local storage. The + // inner loop body is: + // + // a[f][o] += Wcol[o] * h[f] (o vectorized, h[f] scalar broadcast) + // + // where Wcol = W[:,cp] is stride-1 in o and h[f] = history[col+f][cp] is a + // scalar. On AArch64 the compiler emits vmlaq_n_f32 (SIMD FMA with scalar + // broadcast) with weight vector Wcol reused across all T frames — the same + // weight amortisation as an explicit NEON microkernel, without intrinsics. + // Equivalent SIMD broadcast-FMA instructions are emitted on x86 (AVX) and + // other SIMD targets automatically. { - const int tap_base = tap_base_phys(K - 1 - k); - Eigen::Map W(&L.conv_w[static_cast(k) * Channels * Channels]); - Eigen::Map input_block(&L.history[static_cast(tap_base) * Channels], Channels, num_frames); - ztile.noalias() += W * input_block; + float* z = _z.data(); + constexpr int T = 4; + const int nF4 = (num_frames / T) * T; + + for (int f = 0; f < nF4; f += T) + { + float a[T][Channels]{}; // zero-init; register-allocated by compiler + + for (int k = 0; k < K; k++) + { + const float* W = &L.conv_w[static_cast(k) * Channels * Channels]; + const float* hb = L.history + static_cast(tap_base_phys(K - 1 - k) + f) * Channels; + for (int cp = 0; cp < Channels; cp++) + { + const float* Wcol = W + cp * Channels; // stride-1 → vectorized over o + const float h0 = hb[cp], h1 = hb[Channels + cp], h2 = hb[2 * Channels + cp], h3 = hb[3 * Channels + cp]; + for (int o = 0; o < Channels; o++) + { + a[0][o] += Wcol[o] * h0; + a[1][o] += Wcol[o] * h1; + a[2][o] += Wcol[o] * h2; + a[3][o] += Wcol[o] * h3; + } + } + } + for (int ti = 0; ti < T; ti++) + std::memcpy(z + static_cast(f + ti) * Channels, a[ti], Channels * sizeof(float)); + } + + // Scalar tail for any frames past the T-aligned boundary. + for (int f = nF4; f < num_frames; f++) + { + float* zf = z + static_cast(f) * Channels; + for (int o = 0; o < Channels; o++) + zf[o] = 0.0f; + for (int k = 0; k < K; k++) + { + const float* W = &L.conv_w[static_cast(k) * Channels * Channels]; + const float* h = L.history + static_cast(tap_base_phys(K - 1 - k) + f) * Channels; + for (int cp = 0; cp < Channels; cp++) + { + const float hv = h[cp]; + const float* Wcol = W + cp * Channels; + for (int o = 0; o < Channels; o++) + zf[o] += Wcol[o] * hv; + } + } + } } - // Post-conv: bias, mixin, LeakyReLU, head_sum, 1x1 residual — all block ops. + // Post-conv: bias, mixin, LeakyReLU, head accumulate, 1x1 residual. ztile.colwise() += conv_b_vec; ztile.noalias() += mixin_vec * cond_row; // rank-1 outer product ztile = (ztile.array() < 0.0f).select(ztile.array() * kLeakySlope, ztile.array()); - hsum_block += ztile; - lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM - lin_block.colwise() += l1x1_b_vec; + if constexpr (IsFirst) + hslot_block = ztile; + else + hslot_block += ztile; + // layer1x1 residual (elided on the final layer: its output is dead). + if constexpr (!IsLast) + { + lin_block.noalias() += l1x1_mat * ztile; // 8x8 × 8xN GEMM + lin_block.colwise() += l1x1_b_vec; + } } } @@ -606,14 +685,30 @@ void A2FastModel::_layer_forward_k(Layer& L, const float* cond, int nu // For the A2 shape the detector only admits K in {6, 15}; any other value // here means something passed the detector that shouldn't have. template -void A2FastModel::_layer_forward(int layer_idx, const float* cond, int num_frames) +void A2FastModel::_layer_forward(int layer_idx, const float* cond, int num_frames, bool is_first, + bool is_last, int head_wp) { Layer& L = _layers[layer_idx]; _ring_write(L, num_frames); + // is_first and is_last are mutually exclusive (>1 layer), so three combos. switch (L.kernel_size) { - case 6: _layer_forward_k<6>(L, cond, num_frames); break; - case 15: _layer_forward_k<15>(L, cond, num_frames); break; + case 6: + if (is_first) + _layer_forward_k<6, true, false>(L, cond, num_frames, head_wp); + else if (is_last) + _layer_forward_k<6, false, true>(L, cond, num_frames, head_wp); + else + _layer_forward_k<6, false, false>(L, cond, num_frames, head_wp); + break; + case 15: + if (is_first) + _layer_forward_k<15, true, false>(L, cond, num_frames, head_wp); + else if (is_last) + _layer_forward_k<15, false, true>(L, cond, num_frames, head_wp); + else + _layer_forward_k<15, false, false>(L, cond, num_frames, head_wp); + break; default: throw std::runtime_error("A2FastModel: unexpected kernel_size " + std::to_string(L.kernel_size)); } } @@ -621,10 +716,11 @@ void A2FastModel::_layer_forward(int layer_idx, const float* cond, int // ----------------------------------------------------------------------------- // Head: K=16 dilation-1 conv from Channels to 1, plus bias + scale. // ----------------------------------------------------------------------------- +// process() writes all 23 layers' activations directly into _head_history +// before calling this function, and has already advanced _head_write_pos. template void A2FastModel::_head_forward(float* output, int num_frames) { - _head_ring_write(num_frames); #if NAM_A2_RING_MODE == 1 const int mask = _head_pow2_mask; auto col_of = [&](int f, int k) { return (_head_write_pos - num_frames + f - (kHeadKernelSize - 1 - k)) & mask; }; @@ -672,11 +768,35 @@ void A2FastModel::process(NAM_SAMPLE** input, NAM_SAMPLE** output, int lin[c] = _rechannel_w[c] * x; } - // Zero head accumulator. - std::memset(_head_sum.data(), 0, static_cast(num_frames) * Channels * sizeof(float)); + // Advance the head ring write position. Rewind if writing num_frames more + // would overflow the contiguous range, preserving the K-1 lookback window. + const int head_keep = kHeadKernelSize - 1; + #if NAM_A2_RING_MODE == 1 + const int head_cap = _head_pow2_size; + #else + const int head_cap = _head_history_cols; + #endif + if (_head_write_pos + num_frames > head_cap) + { + std::memmove(_head_history.data(), + _head_history.data() + static_cast(_head_write_pos - head_keep) * Channels, + static_cast(head_keep) * Channels * sizeof(float)); + _head_write_pos = head_keep; + } + const int head_wp = _head_write_pos; + + // Layer forward: layer 0 assigns into _head_history (IsFirst=true), + // layers 1-22 accumulate (IsFirst=false). No separate _head_sum buffer needed. + _layer_forward(0, cond, num_frames, /*is_first=*/true, /*is_last=*/false, head_wp); + for (int li = 1; li < kNumLayers; li++) + _layer_forward(li, cond, num_frames, /*is_first=*/false, /*is_last=*/li == kNumLayers - 1, head_wp); - for (int li = 0; li < kNumLayers; li++) - _layer_forward(li, cond, num_frames); + // Advance the write position past this buffer's worth of frames. + #if NAM_A2_RING_MODE == 1 + _head_write_pos = (head_wp + num_frames) & _head_pow2_mask; + #else + _head_write_pos = head_wp + num_frames; + #endif // Output. float* head_out = _head_out.data();