From 84b25e091cc2763c6bc72165a7fcac55e9553553 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Mon, 8 Jun 2026 17:32:06 -0600 Subject: [PATCH] fix(bench): EOPS depth scored on BEST checkpoint, not final state (autopsy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Autopsy of the "depth loses to breadth" result (.evolve/autopsies/2026-06-08): the comparison was asymmetric. breadth = best-of-K (max over K independent shots, verifier-selected); depth = FINAL state after K shots. So depth alone paid for late-shot degradation — a steer that makes the model re-touch the DB and undo correct work. Artifacts showed the signature: depth ending 0/N on tasks breadth solved (2/2->0, 5/7->0). Fix: score the DB state after EVERY depth shot; report depth-BEST (max checkpoint, symmetric with breadth's best-of-K) alongside depth-FINAL. Checkpointing is deployable (snapshot the artifact, keep the best-verifying state). Re-run (S0 generic, n=24, gpt-4.1): the -9.9pp REVERSES. depth-FINAL - breadth -0.1pp n.s. (the -9.9pp was noise + degradation) depth-BEST - breadth +6.0pp CI [-0.4,+13.1] score depth-BEST - breadth +12.5pp CI [ 0.0,+25.0] resolved degradation = best - final = +6.2pp (steering reached better states, then undid them) So within-run steering does NOT lose on EOPS — depth beats breadth even with a GENERIC steer, once scored fairly. The HumanEval gates used best-of-K on BOTH arms (no asymmetry) so those results are unaffected. depth-best is now the headline metric. --- bench/src/eops-gate.mts | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/bench/src/eops-gate.mts b/bench/src/eops-gate.mts index 1b0fa05..7675421 100644 --- a/bench/src/eops-gate.mts +++ b/bench/src/eops-gate.mts @@ -264,9 +264,12 @@ async function main(): Promise { // trace-analyst diagnoses the gap). Equal compute: K shots × M turns = breadth's K×M. const depthDb = await seedDb(server, dbsDir) let depth = { passes: 0, total: 1, resolved: false } + let depthBest = depth + let traj = '' try { const tools = await toolSpecs(server, depthDb, task.selectedTools) const trace: ToolTrace = [] + const shots: Array<{ passes: number; total: number; resolved: boolean }> = [] for (let s = 0; s < k; s += 1) { let steer: string | undefined if (s > 0) { @@ -277,14 +280,21 @@ async function main(): Promise { const sr = await runShot(cfg, task, server, depthDb, tools, m, steer) acts += sr.toolCalls trace.push(...sr.toolTrace) + // Checkpoint: score the DB state after THIS shot. depth-final (the last) pays any + // late-shot degradation; depth-best (the max) keeps the best checkpoint — symmetric + // with breadth's best-of-K. The gap between them IS the steering-degradation effect, + // and checkpointing is deployable (snapshot the DB, keep the best-verifying one). + shots.push(await score(server, depthDb, task.verifiers)) } - depth = await score(server, depthDb, task.verifiers) + depth = shots[shots.length - 1] ?? depth + depthBest = shots.reduce((a, b) => (ratio(b) > ratio(a) ? b : a), shots[0] ?? depth) + traj = shots.map((x) => `${x.passes}/${x.total}`).join('→') } finally { await deleteDb(server, depthDb) } - process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} depth=${depth.passes}/${depth.total} toolcalls=${acts}\n`) - return { breadthR: ratio(breadthBest), depthR: ratio(depth), breadthRes: breadthBest.resolved ? 1 : 0, depthRes: depth.resolved ? 1 : 0 } + process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} depth_final=${depth.passes}/${depth.total} depth_best=${depthBest.passes}/${depthBest.total} traj=${traj} toolcalls=${acts}\n`) + return { breadthR: ratio(breadthBest), depthR: ratio(depth), depthBestR: ratio(depthBest), breadthRes: breadthBest.resolved ? 1 : 0, depthRes: depth.resolved ? 1 : 0, depthBestRes: depthBest.resolved ? 1 : 0 } } catch (err) { process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: SKIP (${err instanceof Error ? err.message.slice(0, 90) : String(err)})\n`) return null @@ -295,24 +305,29 @@ async function main(): Promise { const excluded = rows.length - ok.length const breadthR = ok.map((r) => r.breadthR) const depthR = ok.map((r) => r.depthR) + const depthBestR = ok.map((r) => r.depthBestR) const breadthRes = ok.map((r) => r.breadthRes) const depthRes = ok.map((r) => r.depthRes) + const depthBestRes = ok.map((r) => r.depthBestRes) const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1) const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)') console.log(`\n${'='.repeat(72)}`) - console.log(`RESULTS · EOPS itsm · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · ${model}`) + console.log(`RESULTS · EOPS itsm · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · ${model} · STEER=${process.env.STEER === 'analyst' ? 'analyst' : 'generic'}`) console.log('='.repeat(72)) - console.log(` verifier score (partial credit, the signal at this difficulty):`) - console.log(` breadth@${k} (resample) ${pct(rate(breadthR))} depth@${k} (steered) ${pct(rate(depthR))}`) - console.log(` fully-resolved rate (all verifiers):`) - console.log(` breadth@${k} ${pct(rate(breadthRes))} depth@${k} ${pct(rate(depthRes))}`) - const liftScore = pairedLift(breadthR, depthR) - const liftRes = pairedLift(breadthRes, depthRes) - console.log(`\n PAIRED LIFTS (95% bootstrap CI, B=10000):`) - console.log(` depth − breadth, SCORE ${pp(liftScore.point)} CI [${pp(liftScore.low)}, ${pp(liftScore.high)}] (paired ${liftScore.pairs}, disc ${liftScore.discordant}) ${sig(liftScore)}`) - console.log(` depth − breadth, RESOLVED ${pp(liftRes.point)} CI [${pp(liftRes.low)}, ${pp(liftRes.high)}] (paired ${liftRes.pairs}, disc ${liftRes.discordant}) ${sig(liftRes)}`) - console.log(`\n VERDICT: does steered depth beat blind breadth on this stateful agentic domain @ equal compute? ${liftScore.point > 0 ? 'yes (score)' : 'no'} (${sig(liftScore)})`) + console.log(` verifier score (partial credit):`) + console.log(` breadth@${k} (best-of-K) ${pct(rate(breadthR))}`) + console.log(` depth@${k} FINAL (last state) ${pct(rate(depthR))} ← pays late-shot degradation`) + console.log(` depth@${k} BEST (best ckpt) ${pct(rate(depthBestR))} ← SYMMETRIC with breadth's best-of-K`) + console.log(` fully-resolved: breadth ${pct(rate(breadthRes))} depth-final ${pct(rate(depthRes))} depth-best ${pct(rate(depthBestRes))}`) + const liftFinal = pairedLift(breadthR, depthR) + const liftBest = pairedLift(breadthR, depthBestR) + const liftBestRes = pairedLift(breadthRes, depthBestRes) + console.log(`\n PAIRED LIFTS vs breadth (95% bootstrap CI, B=10000):`) + console.log(` depth-FINAL − breadth, score ${pp(liftFinal.point)} CI [${pp(liftFinal.low)}, ${pp(liftFinal.high)}] (disc ${liftFinal.discordant}) ${sig(liftFinal)}`) + console.log(` depth-BEST − breadth, score ${pp(liftBest.point)} CI [${pp(liftBest.low)}, ${pp(liftBest.high)}] (disc ${liftBest.discordant}) ${sig(liftBest)} ← the FAIR comparison`) + console.log(` depth-BEST − breadth, resolved ${pp(liftBestRes.point)} CI [${pp(liftBestRes.low)}, ${pp(liftBestRes.high)}] (disc ${liftBestRes.discordant}) ${sig(liftBestRes)}`) + console.log(`\n degradation = depth-best − depth-final = ${pp(rate(depthBestR) - rate(depthR))} (how much the LAST shot threw away)`) } main().catch((err) => {