From 84b25e091cc2763c6bc72165a7fcac55e9553553 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Mon, 8 Jun 2026 17:32:06 -0600
Subject: [PATCH] fix(bench): EOPS depth scored on BEST checkpoint, not final
 state (autopsy)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Autopsy of the "depth loses to breadth" result (.evolve/autopsies/2026-06-08):
the comparison was asymmetric. breadth = best-of-K (max over K independent shots,
verifier-selected); depth = FINAL state after K shots. So depth alone paid for
late-shot degradation — a steer that makes the model re-touch the DB and undo
correct work. Artifacts showed the signature: depth ending 0/N on tasks breadth
fully or mostly solved (2/2->0, 5/7->0).

Fix: score the DB state after EVERY depth shot; report depth-BEST (max checkpoint,
symmetric with breadth's best-of-K) alongside depth-FINAL. Checkpointing is
deployable (snapshot the artifact, keep the best-verifying state).

Re-run (S0 generic, n=24, gpt-4.1): the -9.9pp REVERSES.
  depth-FINAL - breadth   -0.1pp  n.s.   (the -9.9pp was noise + degradation)
  depth-BEST  - breadth   +6.0pp  CI [-0.4,+13.1]  score
  depth-BEST  - breadth  +12.5pp  CI [ 0.0,+25.0]  resolved
  degradation = best - final = +6.2pp (steering reached better states, then undid them)

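So within-run steering does NOT lose on EOPS — depth-best is ahead of breadth on
point estimates even with a GENERIC steer, once scored fairly. Caveat: both
depth-BEST CIs still reach 0 (score [-0.4,+13.1], resolved [0.0,+25.0]), so the
script's own `low > 0` significance test reports them as n.s. at n=24. The HumanEval
gates used best-of-K on BOTH arms (no asymmetry) so those results are unaffected.
depth-best is now the headline metric.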
---
 bench/src/eops-gate.mts | 43 +++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/bench/src/eops-gate.mts b/bench/src/eops-gate.mts
index 1b0fa05..7675421 100644
--- a/bench/src/eops-gate.mts
+++ b/bench/src/eops-gate.mts
@@ -264,9 +264,12 @@ async function main(): Promise<void> {
     // trace-analyst diagnoses the gap). Equal compute: K shots × M turns = breadth's K×M.
     const depthDb = await seedDb(server, dbsDir)
     let depth = { passes: 0, total: 1, resolved: false }
+    let depthBest = depth
+    let traj = ''
     try {
       const tools = await toolSpecs(server, depthDb, task.selectedTools)
       const trace: ToolTrace = []
+      const shots: Array<{ passes: number; total: number; resolved: boolean }> = []
       for (let s = 0; s < k; s += 1) {
         let steer: string | undefined
         if (s > 0) {
@@ -277,14 +280,21 @@ async function main(): Promise<void> {
         const sr = await runShot(cfg, task, server, depthDb, tools, m, steer)
         acts += sr.toolCalls
         trace.push(...sr.toolTrace)
+        // Checkpoint: score the DB state after THIS shot. depth-final (the last) pays any
+        // late-shot degradation; depth-best (the max) keeps the best checkpoint — symmetric
+        // with breadth's best-of-K. The gap between them IS the steering-degradation effect,
+        // and checkpointing is deployable (snapshot the DB, keep the best-verifying one).
+        shots.push(await score(server, depthDb, task.verifiers))
       }
-      depth = await score(server, depthDb, task.verifiers)
+      depth = shots[shots.length - 1] ?? depth
+      depthBest = shots.reduce((a, b) => (ratio(b) > ratio(a) ? b : a), shots[0] ?? depth)
+      traj = shots.map((x) => `${x.passes}/${x.total}`).join('→')
     } finally {
       await deleteDb(server, depthDb)
     }
 
-    process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} depth=${depth.passes}/${depth.total} toolcalls=${acts}\n`)
-    return { breadthR: ratio(breadthBest), depthR: ratio(depth), breadthRes: breadthBest.resolved ? 1 : 0, depthRes: depth.resolved ? 1 : 0 }
+    process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} depth_final=${depth.passes}/${depth.total} depth_best=${depthBest.passes}/${depthBest.total} traj=${traj} toolcalls=${acts}\n`)
+    return { breadthR: ratio(breadthBest), depthR: ratio(depth), depthBestR: ratio(depthBest), breadthRes: breadthBest.resolved ? 1 : 0, depthRes: depth.resolved ? 1 : 0, depthBestRes: depthBest.resolved ? 1 : 0 }
     } catch (err) {
       process.stderr.write(`  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: SKIP (${err instanceof Error ? err.message.slice(0, 90) : String(err)})\n`)
       return null
@@ -295,24 +305,29 @@ async function main(): Promise<void> {
   const excluded = rows.length - ok.length
   const breadthR = ok.map((r) => r.breadthR)
   const depthR = ok.map((r) => r.depthR)
+  const depthBestR = ok.map((r) => r.depthBestR)
   const breadthRes = ok.map((r) => r.breadthRes)
   const depthRes = ok.map((r) => r.depthRes)
+  const depthBestRes = ok.map((r) => r.depthBestRes)
   const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1)
   const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')
 
   console.log(`\n${'='.repeat(72)}`)
-  console.log(`RESULTS · EOPS itsm · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · ${model}`)
+  console.log(`RESULTS · EOPS itsm · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · ${model} · STEER=${process.env.STEER === 'analyst' ? 'analyst' : 'generic'}`)
   console.log('='.repeat(72))
-  console.log(`  verifier score (partial credit, the signal at this difficulty):`)
-  console.log(`    breadth@${k} (resample) ${pct(rate(breadthR))}    depth@${k} (steered) ${pct(rate(depthR))}`)
-  console.log(`  fully-resolved rate (all verifiers):`)
-  console.log(`    breadth@${k} ${pct(rate(breadthRes))}    depth@${k} ${pct(rate(depthRes))}`)
-  const liftScore = pairedLift(breadthR, depthR)
-  const liftRes = pairedLift(breadthRes, depthRes)
-  console.log(`\n  PAIRED LIFTS (95% bootstrap CI, B=10000):`)
-  console.log(`    depth − breadth, SCORE     ${pp(liftScore.point)}   CI [${pp(liftScore.low)}, ${pp(liftScore.high)}]   (paired ${liftScore.pairs}, disc ${liftScore.discordant})  ${sig(liftScore)}`)
-  console.log(`    depth − breadth, RESOLVED  ${pp(liftRes.point)}   CI [${pp(liftRes.low)}, ${pp(liftRes.high)}]   (paired ${liftRes.pairs}, disc ${liftRes.discordant})  ${sig(liftRes)}`)
-  console.log(`\n  VERDICT: does steered depth beat blind breadth on this stateful agentic domain @ equal compute? ${liftScore.point > 0 ? 'yes (score)' : 'no'} (${sig(liftScore)})`)
+  console.log(`  verifier score (partial credit):`)
+  console.log(`    breadth@${k} (best-of-K)      ${pct(rate(breadthR))}`)
+  console.log(`    depth@${k} FINAL  (last state) ${pct(rate(depthR))}      ← pays late-shot degradation`)
+  console.log(`    depth@${k} BEST   (best ckpt)  ${pct(rate(depthBestR))}      ← SYMMETRIC with breadth's best-of-K`)
+  console.log(`  fully-resolved: breadth ${pct(rate(breadthRes))}  depth-final ${pct(rate(depthRes))}  depth-best ${pct(rate(depthBestRes))}`)
+  const liftFinal = pairedLift(breadthR, depthR)
+  const liftBest = pairedLift(breadthR, depthBestR)
+  const liftBestRes = pairedLift(breadthRes, depthBestRes)
+  console.log(`\n  PAIRED LIFTS vs breadth (95% bootstrap CI, B=10000):`)
+  console.log(`    depth-FINAL − breadth, score  ${pp(liftFinal.point)}   CI [${pp(liftFinal.low)}, ${pp(liftFinal.high)}]   (disc ${liftFinal.discordant})  ${sig(liftFinal)}`)
+  console.log(`    depth-BEST  − breadth, score  ${pp(liftBest.point)}   CI [${pp(liftBest.low)}, ${pp(liftBest.high)}]   (disc ${liftBest.discordant})  ${sig(liftBest)}   ← the FAIR comparison`)
+  console.log(`    depth-BEST  − breadth, resolved ${pp(liftBestRes.point)}   CI [${pp(liftBestRes.low)}, ${pp(liftBestRes.high)}]   (disc ${liftBestRes.discordant})  ${sig(liftBestRes)}`)
+  console.log(`\n  degradation = depth-best − depth-final = ${pp(rate(depthBestR) - rate(depthR))} (how much the LAST shot threw away)`)
 }
 
 main().catch((err) => {