
Commit 9f9eb5c

feat(compute): T2.3 pre-allocate workspace buffers at UploadWeights to avoid capture-time alloc
Add preAllocateWorkspaces(), which eagerly initializes the FP8 scratchpad (scaleOne pointer + struct) and the cuBLASLt handle at the end of UploadWeights, before any CUDA graph capture region begins. These two objects previously used lazy initialization (getFP8Scratch, getLtHandle), which triggered cudaMalloc on first use -- hanging silently on GB10 when that first use happened inside capture. Also add a captureAllocCount atomic counter that tracks allocWeight attempts during active capture; EndCapture resets the counter and logs a warning if it is non-zero, and CaptureAllocCount() exposes the counter for testing.
1 parent 2a723b7 commit 9f9eb5c
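The hazard being fixed is the classic lazy-init-under-capture pattern. Below is a minimal sketch of the lazy pattern and the eager fix; the package, engine type, and cudaMalloc stub are hypothetical illustration names, not the engine's actual code.

package sketch

import "unsafe"

// cudaMalloc stands in for the real device allocator (stub for the sketch).
var cudaMalloc func(n int) (unsafe.Pointer, error)

type engine struct {
	workspace unsafe.Pointer // device scratch buffer, nil until first use
}

// getWorkspace is the lazy pattern: the first caller pays for the
// allocation. If that first call lands inside a CUDA graph capture
// region, cudaMalloc is illegal during capture and, per the commit
// message, hung silently on GB10.
func (e *engine) getWorkspace() (unsafe.Pointer, error) {
	if e.workspace == nil {
		p, err := cudaMalloc(1 << 20)
		if err != nil {
			return nil, err
		}
		e.workspace = p
	}
	return e.workspace, nil
}

// preAllocate is the eager fix: run the same initialization once, before
// any capture begins, so capture-time callers only read a populated pointer.
func (e *engine) preAllocate() error {
	_, err := e.getWorkspace()
	return err
}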

2 files changed: 262 additions & 0 deletions


compute/gpu_engine.go

Lines changed: 57 additions & 0 deletions
@@ -87,6 +87,12 @@ type GPUEngine[T tensor.Numeric] struct {
 	// when cuBLAS receives very large matrices (e.g., 128256x4096 LM head).
 	// Default: DefaultMaxAllocBytes (4 GB).
 	maxAllocBytes int64
+
+	// captureAllocCount tracks allocWeight calls that occur during an active
+	// CUDA graph capture. A properly pre-allocated workload should see zero.
+	// Incremented atomically in allocWeight when capture is detected;
+	// checked and reset in EndCapture.
+	captureAllocCount atomic.Int64
 }
 
 // NewGPUEngine creates a new GPUEngine backed by CUDA via the GRAL abstraction.
@@ -570,6 +576,10 @@ func (e *GPUEngine[T]) UploadWeights(tensors []*tensor.TensorNumeric[float32]) e
 			"device", fmt.Sprintf("%d", e.deviceID),
 			"method", method)
 	}
+	// Pre-allocate all workspace buffers that would otherwise be lazily
+	// initialized on first use. This ensures no cudaMalloc occurs inside
+	// a subsequent CUDA graph capture region.
+	e.preAllocateWorkspaces()
 	return nil
 }
 
@@ -638,6 +648,7 @@ func (e *GPUEngine[T]) allocWeight(byteSize int) (unsafe.Pointer, error) {
 		return mallocAsyncFn(byteSize, s)
 	}
 	if err := e.ensureNotCapturing(); err != nil {
+		e.captureAllocCount.Add(1)
 		return nil, err
 	}
 	if e.managedMem {
@@ -714,6 +725,10 @@ func (e *GPUEngine[T]) EndCapture() (GraphHandle, error) {
 	if cap, ok := e.pool.(gpuapi.CaptureAwareAllocator); ok {
 		defer cap.ClearCaptureStream()
 	}
+	if n := e.captureAllocCount.Swap(0); n > 0 {
+		e.logger.Warn("allocWeight called during capture",
+			"count", fmt.Sprintf("%d", n))
+	}
 	s := cuda.StreamFromPtr(e.Stream())
 	graph, err := streamEndCaptureFn(s)
 	if err != nil {
@@ -819,6 +834,48 @@ func (e *GPUEngine[T]) Close() error {
 	return firstErr
 }
 
+// CaptureAllocCount returns the cumulative number of allocWeight calls that
+// were attempted while a CUDA graph capture was active. A properly
+// pre-allocated workload should observe zero after EndCapture.
+func (e *GPUEngine[T]) CaptureAllocCount() int64 {
+	return e.captureAllocCount.Load()
+}
+
+// preAllocateWorkspaces eagerly initializes all lazy-allocated workspace
+// buffers so that no cudaMalloc occurs inside a CUDA graph capture region.
+// Called at the end of UploadWeights, after all weight tensors are on GPU.
+//
+// For dense float32 workloads, pool.Alloc (arena-backed) is capture-safe via
+// CaptureAwareAllocator, but objects allocated outside the arena — the FP8
+// scratchpad and the cuBLASLt handle — use cudaMalloc and would hang if first
+// touched during capture on GB10.
+func (e *GPUEngine[T]) preAllocateWorkspaces() {
+	// 1. FP8 scratchpad: allocate scaleOne and the struct itself so that the
+	// first FP8 MatMul during capture does not trigger cudaMalloc.
+	if e.fp8Scratch == nil {
+		if s, err := e.getFP8Scratch(); err != nil {
+			e.logger.Warn("preAllocateWorkspaces: FP8 scratchpad init failed",
+				"error", err.Error())
+		} else {
+			_ = s // assigned to e.fp8Scratch inside getFP8Scratch
+		}
+	}
+
+	// 2. cuBLASLt handle: cublasLtCreate allocates internal CUDA state.
+	if e.ltHandle == nil {
+		if h, err := e.getLtHandle(); err != nil {
+			e.logger.Warn("preAllocateWorkspaces: cuBLASLt handle init failed",
+				"error", err.Error())
+		} else {
+			_ = h // assigned to e.ltHandle inside getLtHandle
+		}
+	}
+
+	e.logger.Info("workspace buffers pre-allocated",
+		"fp8Scratch", fmt.Sprintf("%v", e.fp8Scratch != nil),
+		"ltHandle", fmt.Sprintf("%v", e.ltHandle != nil))
+}
+
 // OOMFallbackCount returns the number of times GPU OOM triggered CPU fallback.
 func (e *GPUEngine[T]) OOMFallbackCount() int64 {
 	return e.oomFallbackCount.Load()
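For reference, a minimal usage sketch of the intended call order after this change, assuming it lives in package compute with fmt imported. BeginCapture and runDecodeStep are assumed names for illustration; only UploadWeights, EndCapture, and CaptureAllocCount appear in this diff. The counter is read before EndCapture, since EndCapture swaps it back to zero.

// buildGraph sketches the pre-allocate-then-capture sequence.
func buildGraph[T tensor.Numeric](e *GPUEngine[T], ws []*tensor.TensorNumeric[float32]) error {
	// UploadWeights now ends with preAllocateWorkspaces, so the FP8
	// scratchpad and cuBLASLt handle already exist before capture.
	if err := e.UploadWeights(ws); err != nil {
		return err
	}
	if err := e.BeginCapture(); err != nil { // hypothetical counterpart to EndCapture
		return err
	}
	runDecodeStep(e) // enqueue the work to be captured (assumed helper)

	// Inspect the counter before EndCapture resets it: a properly
	// pre-allocated workload should read zero here.
	if n := e.CaptureAllocCount(); n != 0 {
		return fmt.Errorf("%d capture-time allocation attempts", n)
	}
	_, err := e.EndCapture()
	return err
}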

compute/workspace_prealloc_test.go

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+package compute
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/zerfoo/ztensor/internal/cuda"
+	"github.com/zerfoo/ztensor/log"
+	"github.com/zerfoo/ztensor/numeric"
+	"github.com/zerfoo/ztensor/tensor"
+)
+
+// TestPreAllocateWorkspaces_FP8ScratchInitialized verifies that after
+// UploadWeights, the FP8 scratchpad is non-nil (eagerly initialized).
+func TestPreAllocateWorkspaces_FP8ScratchInitialized(t *testing.T) {
+	eng := newPreallocEngine(t)
+	if eng.fp8Scratch != nil {
+		t.Fatal("precondition: fp8Scratch should be nil before UploadWeights")
+	}
+
+	if err := eng.UploadWeights(nil); err != nil {
+		t.Fatalf("UploadWeights: %v", err)
+	}
+
+	if eng.fp8Scratch == nil {
+		t.Fatal("fp8Scratch should be non-nil after UploadWeights")
+	}
+	if eng.fp8Scratch.scaleOne == nil {
+		t.Fatal("fp8Scratch.scaleOne should be non-nil after pre-allocation")
+	}
+}
+
+// TestPreAllocateWorkspaces_CalledByUploadWeights verifies that
+// preAllocateWorkspaces fires at the end of UploadWeights even when
+// called with an empty weight list (the pre-allocation is unconditional).
+func TestPreAllocateWorkspaces_CalledByUploadWeights(t *testing.T) {
+	eng := newPreallocEngine(t)
+
+	if err := eng.UploadWeights([]*tensor.TensorNumeric[float32]{}); err != nil {
+		t.Fatalf("UploadWeights: %v", err)
+	}
+
+	if eng.fp8Scratch == nil {
+		t.Fatal("fp8Scratch should be non-nil after UploadWeights")
+	}
+	if eng.fp8Scratch.scaleOne == nil {
+		t.Fatal("fp8Scratch.scaleOne should be non-nil after pre-allocation")
+	}
+}
+
+// TestPreAllocateWorkspaces_TableDriven exercises workspace pre-allocation
+// with varying weight list sizes. Pre-allocation is unconditional, so
+// fp8Scratch should be non-nil regardless of weight count.
+func TestPreAllocateWorkspaces_TableDriven(t *testing.T) {
+	tests := []struct {
+		name       string
+		numWeights int
+	}{
+		{name: "no weights", numWeights: 0},
+		{name: "one nil entry", numWeights: 1},
+		{name: "three nil entries", numWeights: 3},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			eng := newPreallocEngine(t)
+			pool := eng.pool.(*fakeMemPool)
+
+			// Pass nil tensor entries -- UploadWeights skips them.
+			weights := make([]*tensor.TensorNumeric[float32], tt.numWeights)
+			if err := eng.UploadWeights(weights); err != nil {
+				t.Fatalf("UploadWeights: %v", err)
+			}
+
+			// Fatal here: the scaleOne check below would otherwise
+			// dereference a nil fp8Scratch.
+			if eng.fp8Scratch == nil {
+				t.Fatal("fp8Scratch should be non-nil after UploadWeights")
+			}
+			if eng.fp8Scratch.scaleOne == nil {
+				t.Error("fp8Scratch.scaleOne should be non-nil")
+			}
+			// scaleOne alloc is the minimum: 1 pool.Alloc from getFP8Scratch.
+			if pool.allocCount < 1 {
+				t.Errorf("expected at least 1 alloc from pre-allocation, got %d", pool.allocCount)
+			}
+		})
+	}
+}
+
+// TestCaptureAllocCount_ZeroAfterPrealloc verifies that captureAllocCount
+// stays at zero when allocWeight is not called during capture. This is the
+// expected state for a properly pre-allocated workload.
+func TestCaptureAllocCount_ZeroAfterPrealloc(t *testing.T) {
+	eng := newPreallocEngine(t)
+	if err := eng.UploadWeights(nil); err != nil {
+		t.Fatalf("UploadWeights: %v", err)
+	}
+
+	if got := eng.CaptureAllocCount(); got != 0 {
+		t.Fatalf("CaptureAllocCount after UploadWeights: got %d, want 0", got)
+	}
+}
+
+// TestCaptureAllocCount_IncrementsOnCaptureTimeAlloc verifies that
+// allocWeight increments captureAllocCount when capture is active.
+func TestCaptureAllocCount_IncrementsOnCaptureTimeAlloc(t *testing.T) {
+	restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) {
+		return cuda.CaptureStatusActive, nil
+	})
+	defer restore()
+
+	eng := &GPUEngine[float32]{stream: fakePtrStream{}}
+
+	// First attempt — should fail with capture sentinel and increment counter.
+	_, err := eng.allocWeight(4096)
+	if !errors.Is(err, ErrCaptureIncompatibleAllocation) {
+		t.Fatalf("allocWeight: expected ErrCaptureIncompatibleAllocation, got %v", err)
+	}
+
+	if got := eng.CaptureAllocCount(); got != 1 {
+		t.Fatalf("CaptureAllocCount after 1 attempt: got %d, want 1", got)
+	}
+
+	// Second attempt — count should increase.
+	_, _ = eng.allocWeight(8192)
+	if got := eng.CaptureAllocCount(); got != 2 {
+		t.Fatalf("CaptureAllocCount after 2 attempts: got %d, want 2", got)
+	}
+}
+
+// TestCaptureAllocCount_ResetByEndCapture verifies that EndCapture resets
+// the captureAllocCount to zero after logging.
+func TestCaptureAllocCount_ResetByEndCapture(t *testing.T) {
+	// Arrange: inject a capture-active status for allocWeight, then swap to
+	// a non-capture status for EndCapture.
+	captureActive := true
+	restore := swapCaptureStatusFn(func(_ *cuda.Stream) (cuda.CaptureStatus, error) {
+		if captureActive {
+			return cuda.CaptureStatusActive, nil
+		}
+		return cuda.CaptureStatusNone, nil
+	})
+	defer restore()
+
+	eng := &GPUEngine[float32]{
+		stream: fakePtrStream{},
+		logger: log.Nop(),
+	}
+
+	// Trigger two allocWeight attempts during capture.
+	_, _ = eng.allocWeight(4096)
+	_, _ = eng.allocWeight(8192)
+	if got := eng.CaptureAllocCount(); got != 2 {
+		t.Fatalf("CaptureAllocCount before EndCapture: got %d, want 2", got)
+	}
+
+	// EndCapture will fail (no real graph) but should still reset the counter.
+	captureActive = false
+	oldEnd := streamEndCaptureFn
+	streamEndCaptureFn = func(_ *cuda.Stream) (*cuda.Graph, error) {
+		return nil, errors.New("synthetic: no graph")
+	}
+	defer func() { streamEndCaptureFn = oldEnd }()
+
+	_, _ = eng.EndCapture()
+
+	if got := eng.CaptureAllocCount(); got != 0 {
+		t.Fatalf("CaptureAllocCount after EndCapture: got %d, want 0", got)
+	}
+}
+
+// TestPreAllocateWorkspaces_Idempotent verifies that calling
+// preAllocateWorkspaces multiple times does not leak or double-allocate.
+func TestPreAllocateWorkspaces_Idempotent(t *testing.T) {
+	eng := newPreallocEngine(t)
+	pool := eng.pool.(*fakeMemPool)
+
+	eng.preAllocateWorkspaces()
+	allocsAfterFirst := pool.allocCount
+
+	eng.preAllocateWorkspaces()
+	allocsAfterSecond := pool.allocCount
+
+	if allocsAfterSecond != allocsAfterFirst {
+		t.Fatalf("second preAllocateWorkspaces caused %d new allocs, want 0",
+			allocsAfterSecond-allocsAfterFirst)
+	}
+}
+
+// newPreallocEngine builds a GPUEngine suitable for testing workspace
+// pre-allocation without real CUDA hardware.
+func newPreallocEngine(t *testing.T) *GPUEngine[float32] {
+	t.Helper()
+	pool := newFakeMemPool()
+	return &GPUEngine[float32]{
+		cpu:           NewCPUEngine[float32](numeric.Float32Ops{}),
+		runtime:       fakeRuntime{},
+		pool:          pool,
+		stream:        fakeStream{},
+		logger:        log.Nop(),
+		deviceID:      0,
+		dtype:         DTypeF32,
+		maxAllocBytes: DefaultMaxAllocBytes,
+	}
+}
