From 1a222436de7df01a36f3098b13d0602c47df3b02 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Mon, 8 Jun 2026 17:14:22 -0600
Subject: [PATCH] =?UTF-8?q?feat(bench):=20EOPS=20analyst-steered=20depth?=
 =?UTF-8?q?=20(S1)=20=E2=80=94=20de-confound=20steer=20quality?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The n=24 EOPS gate showed GENERIC-steered depth losing to breadth
(-9.9pp). But the prior depth-WIN (+13.4pp) used a trace-analyst on a
different harness — so steer type, harness, and n were all confounded.
This isolates the steerer: same eops-gate harness, same domain, same n,
same breadth control — only STEER varies.

- routerToolLoop now returns toolTrace (each tool call + result) — what
  a trace analyst reads (behavior, never the verdict).
- STEER=analyst wires an inline agent-eval-style trace-analyst as the
  depth steer: it reads the agent's tool-call trace, diagnoses the
  remaining gap, and issues one concrete corrective instruction.
  FIREWALLED — never sees the verifiers/expected values. STEER=generic
  keeps the fixed nudge (the -9.9pp control).

The decisive comparison: depth@analyst vs breadth, vs depth@generic vs
breadth. If the analyst flips depth from significantly-losing to winning
where the generic steer lost — same harness/n/domain — steer QUALITY is
the operative variable, not depth. Maps to the repo's "analyst = 3
runtimes" F3 (inline / Halo-cli / sandboxed-fanout).
--- bench/src/eops-gate.mts | 53 +++++++++++++++++++++++++++++++------- bench/src/router-client.ts | 9 +++++-- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/bench/src/eops-gate.mts b/bench/src/eops-gate.mts index 14c09e0..1b0fa05 100644 --- a/bench/src/eops-gate.mts +++ b/bench/src/eops-gate.mts @@ -22,7 +22,7 @@ */ import { readFileSync } from 'node:fs' import { join } from 'node:path' -import { type RouterConfig, type ToolSpec, routerToolLoop } from './router-client' +import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client' import { type PairedLift, pairedLift, pool } from './stats.mts' function must(name: string): string { @@ -183,7 +183,9 @@ function shotPrompt(task: EopsTask, steer?: string): string { ].join('\n') } -async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbId: string, tools: ToolSpec[], maxTurns: number, steer?: string): Promise { +type ToolTrace = Array<{ name: string; args: string; result: string }> + +async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbId: string, tools: ToolSpec[], maxTurns: number, steer?: string): Promise<{ toolCalls: number; toolTrace: ToolTrace }> { const r = await routerToolLoop( cfg, task.systemPrompt || 'You are an IT service-management operations agent.', @@ -192,7 +194,30 @@ async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbI async (name, args) => callTool(server, dbId, name, args as Record), { maxTurns, temperature: 0.3 }, ) - return r.toolCalls + return { toolCalls: r.toolCalls, toolTrace: r.toolTrace } +} + +/** S1 steerer — the inline agent-eval-style trace-analyst. FIREWALLED: it reads the + * agent's tool-call trace (behavior), NEVER the verifiers or their expected values. + * Diagnoses the remaining gap and issues one concrete corrective instruction. 
*/ +async function analystSteer(cfg: RouterConfig, task: EopsTask, trace: ToolTrace): Promise { + const summary = trace.map((t) => `${t.name}(${t.args.slice(0, 140)}) -> ${t.result.slice(0, 180)}`).join('\n').slice(-4000) + const r = await routerChatWithUsage( + cfg, + [ + { + role: 'system', + content: + 'You are a senior ITSM operations reviewer. You are shown an agent\'s tool-call trace on a task it has NOT completed. Diagnose precisely what the task still requires and issue ONE concrete corrective instruction — name the specific records, fields, and target values to set. Do not restate the task, do not praise, do not summarize the trace. Output only the single next instruction.', + }, + { + role: 'user', + content: `TASK:\n${task.userPrompt}\n\nAGENT TRACE SO FAR:\n${summary || '(no tool calls yet)'}\n\nThe single most important still-missing or incorrect step, as one concrete instruction:`, + }, + ], + { temperature: 0.2 }, + ) + return r.content.trim() } const pct = (x: number) => `${(x * 100).toFixed(1)}%` @@ -207,8 +232,9 @@ async function main(): Promise { const dbsDir = must('EOPS_GYM_DBS_DIR') const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model } const concurrency = Number(process.env.CONCURRENCY ?? 4) + const steerMode = process.env.STEER === 'analyst' ? 
'analyst' : 'generic' - console.log(`=== EOPS depth-vs-breadth gate · tool-using router worker · N=${n} K=${k} M=${m} model=${model} ===`) + console.log(`=== EOPS depth-vs-breadth gate · tool-using router worker · N=${n} K=${k} M=${m} model=${model} · STEER=${steerMode} ===`) const tasks = await loadTasks(n, offset) console.log(`loaded ${tasks.length} itsm task(s); breadth@${k} (resample, K×M turns) vs depth@${k} (one loop, ${k * m} turns), conc=${concurrency}\n`) @@ -224,7 +250,7 @@ async function main(): Promise { const dbId = await seedDb(server, dbsDir) try { const tools = await toolSpecs(server, dbId, task.selectedTools) - acts += await runShot(cfg, task, server, dbId, tools, m) + acts += (await runShot(cfg, task, server, dbId, tools, m)).toolCalls breadthScores.push(await score(server, dbId, task.verifiers)) } finally { await deleteDb(server, dbId) @@ -233,15 +259,24 @@ async function main(): Promise { const breadthBest = breadthScores.reduce((a, b) => (ratio(b) > ratio(a) ? b : a), breadthScores[0] ?? { passes: 0, total: 1, resolved: false }) // depth@K: K SEQUENTIAL shots over ONE persistent DB — the artifact accumulates and - // each re-engagement is steered to finish what's left (the regime where depth won - // before). Equal compute: K shots × M turns, same as breadth's K × M. + // each re-engagement is STEERED to finish what's left. The steer is the variable + // under test: STEER=generic (a fixed nudge) vs STEER=analyst (the firewalled + // trace-analyst diagnoses the gap). Equal compute: K shots × M turns = breadth's K×M. const depthDb = await seedDb(server, dbsDir) let depth = { passes: 0, total: 1, resolved: false } try { const tools = await toolSpecs(server, depthDb, task.selectedTools) + const trace: ToolTrace = [] for (let s = 0; s < k; s += 1) { - const steer = s === 0 ? undefined : 'Re-inspect the current state with the read tools, identify what the task still requires, and complete it. 
Do not stop until every required change is verified in place.' - acts += await runShot(cfg, task, server, depthDb, tools, m, steer) + let steer: string | undefined + if (s > 0) { + steer = steerMode === 'analyst' + ? await analystSteer(cfg, task, trace) + : 'Re-inspect the current state with the read tools, identify what the task still requires, and complete it. Do not stop until every required change is verified in place.' + } + const sr = await runShot(cfg, task, server, depthDb, tools, m, steer) + acts += sr.toolCalls + trace.push(...sr.toolTrace) } depth = await score(server, depthDb, task.verifiers) } finally { diff --git a/bench/src/router-client.ts b/bench/src/router-client.ts index d1b20e8..b1b0a54 100644 --- a/bench/src/router-client.ts +++ b/bench/src/router-client.ts @@ -150,6 +150,9 @@ export interface RouterToolLoopResult { /** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */ turns: number toolCalls: number + /** The behavior trace: each tool call + its result, in order. What a trace-analyst + * steerer reads (behavior, never the verdict) to diagnose + redirect the next shot. 
*/ + toolTrace: Array<{ name: string; args: string; result: string }> usage: { input: number; output: number } } @@ -180,6 +183,7 @@ export async function routerToolLoop( let toolCalls = 0 let lastText = '' const usage = { input: 0, output: 0 } + const toolTrace: Array<{ name: string; args: string; result: string }> = [] for (let turn = 1; turn <= maxTurns; turn += 1) { const r = await routerChatWithTools(cfg, messages, tools, { @@ -191,7 +195,7 @@ export async function routerToolLoop( usage.output += r.usage.output } if (r.content) lastText = r.content - if (r.toolCalls.length === 0) return { final: lastText, turns: turn, toolCalls, usage } + if (r.toolCalls.length === 0) return { final: lastText, turns: turn, toolCalls, toolTrace, usage } // Record the assistant turn verbatim (content + the tool_calls it requested), then // run each call on the host and fold the result back as a `tool` message. @@ -213,7 +217,8 @@ export async function routerToolLoop( } const out = await execute(tc.name, args) messages.push({ role: 'tool', tool_call_id: tc.id, content: out }) + toolTrace.push({ name: tc.name, args: tc.arguments, result: out }) } } - return { final: lastText, turns: maxTurns, toolCalls, usage } + return { final: lastText, turns: maxTurns, toolCalls, toolTrace, usage } }