/**
 * EnterpriseOps-Gym depth-vs-breadth gate — the agentic, stateful domain where
 * steering is hypothesized to beat compute (the opposite regime to HumanEval,
 * where breadth/resampling won). The worker is the TOOL-USING router backend
 * (`routerToolLoop`): it calls the gym's live MCP tools, sees the results, and
 * acts — off-box (router inference + host→gym HTTP), no sandbox.
 *
 *   breadth@K — K independent shots, each a short agentic loop (M turns) on its
 *               OWN fresh seeded DB; keep the best by the deployable verifier
 *               (resample).
 *   depth@K   — K sequential shots (M turns each) over ONE persistent DB; the
 *               artifact (DB state) accumulates, and every shot after the first
 *               is steered to re-inspect the state and finish what is left.
 *
 * Equal compute = equal total inference turns (K·M). Score = the task's own SQL
 * verifiers (deployable check), run on the final DB state. Per-task {0,1} resolved,
 * paired 95% bootstrap CI.
 *
 * Stand up first:
 *   docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
 *   # gym_dbs.zip from github.com/ServiceNow/EnterpriseOps-Gym (root), unzipped:
 *   export EOPS_GYM_DBS_DIR=/path/to/unzipped/dbs
 *   TANGLE_API_KEY=… N=20 K=3 M=5 WORKER_MODEL=gpt-4o-mini tsx src/eops-gate.mts
 */
import { readFileSync } from 'node:fs'
import { join } from 'node:path'
import { type RouterConfig, type ToolSpec, routerToolLoop } from './router-client'
import { type PairedLift, pairedLift, pool } from './stats.mts'

/** Read a required environment variable; throws when it is unset or empty. */
function must(name: string): string {
  const v = process.env[name]
  if (!v) throw new Error(`env ${name} is required`)
  return v
}

/**
 * Read an integer environment variable.
 *
 * @param name     variable name
 * @param fallback value used when the variable is unset or blank
 * @param min      smallest accepted value (inclusive)
 * @throws when the variable is set but is not an integer >= `min` — a NaN or
 *         zero K/M/CONCURRENCY would otherwise silently produce an empty or
 *         meaningless run.
 */
function intEnv(name: string, fallback: number, min: number): number {
  const raw = process.env[name]
  if (raw === undefined || raw.trim() === '') return fallback
  const v = Number(raw)
  if (!Number.isInteger(v) || v < min) throw new Error(`env ${name} must be an integer >= ${min}, got '${raw}'`)
  return v
}

const dataset = 'ServiceNow-AI/EnterpriseOps-Gym'

type ComparisonType = 'equals' | 'greater_than' | 'less_than' | 'contains'
interface Verifier {
  verifier_type?: string
  gym_name?: string
  /** EOPS nests the deterministic check here; comparison_type defaults to 'equals'. */
  validation_config?: { query?: string; expected_value?: unknown; comparison_type?: ComparisonType }
}
interface GymServer {
  mcp_server_url: string
  seed_database_file: string
  /** Extra key/value pairs spread into the request headers of every gym call. */
  context?: Record<string, string>
}
interface EopsTask {
  taskId: string
  systemPrompt: string
  userPrompt: string
  selectedTools: string[]
  servers: GymServer[]
  verifiers: Verifier[]
}
/** Outcome of running a task's verifiers against one DB. */
interface Score {
  passes: number
  total: number
  resolved: boolean
}

/**
 * Normalise a dataset field that is either an array or a JSON-encoded array.
 * A missing field (null/undefined) yields `[]`; anything else that is not an
 * array is rejected so a malformed row cannot crash later, outside any handler.
 */
const asArray = <T>(v: unknown): T[] => {
  const parsed: unknown = typeof v === 'string' ? JSON.parse(v) : v
  if (parsed == null) return []
  if (!Array.isArray(parsed)) throw new Error(`expected an array, got ${typeof parsed}`)
  return parsed as T[]
}

/** Pull itsm tasks from the HF rows server (the oracle tool-set config). Fail loud. */
async function loadTasks(n: number, offset: number): Promise<EopsTask[]> {
  const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(dataset)}&config=oracle&split=itsm&offset=${offset}&length=${n}`
  const res = await fetch(url)
  if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}: ${url}`)
  const data = (await res.json()) as { rows?: Array<{ row: Record<string, unknown> }> }
  const rows = data.rows ?? []
  if (rows.length === 0) throw new Error('EOPS HF returned 0 rows')
  return rows.map(({ row }) => ({
    taskId: String(row.task_id),
    systemPrompt: String(row.system_prompt ?? ''),
    userPrompt: String(row.user_prompt ?? ''),
    selectedTools: asArray<string>(row.selected_tools),
    servers: asArray<GymServer>(row.gym_servers_config),
    verifiers: asArray<Verifier>(row.verifiers),
  }))
}

// ── gym client (mirrors scripts/enterpriseops_gym_judge.py) ────────────────────

/** Headers for a gym call: JSON content type, the server's context entries, and the target DB id. */
function authHeaders(server: GymServer, dbId: string): Record<string, string> {
  return { 'content-type': 'application/json', ...(server.context ?? {}), 'x-database-id': dbId }
}

/** POST and parse a JSON body OR the last `data:` line of an SSE stream (/mcp streams SSE). */
async function postJson(url: string, body: unknown, headers: Record<string, string>): Promise<{ status: number; json: unknown }> {
  const r = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) })
  const text = await r.text()
  const dataLines = text
    .split('\n')
    .filter((l) => l.startsWith('data:'))
    .map((l) => l.slice(5).trim())
  const payload = dataLines.length ? dataLines[dataLines.length - 1] : text
  try {
    return { status: r.status, json: JSON.parse(payload ?? 'null') }
  } catch {
    // Not JSON: hand the raw text back so callers can include it in diagnostics.
    return { status: r.status, json: text }
  }
}

/** Create a fresh database on the gym from the server's seed SQL file; returns its id. */
async function seedDb(server: GymServer, dbsDir: string): Promise<string> {
  const dbId = `gate_${Math.random().toString(36).slice(2, 12)}`
  const sql = readFileSync(join(dbsDir, server.seed_database_file), 'utf8')
  const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/seed-database`
  const { status, json } = await postJson(url, { database_id: dbId, name: `gate_${dbId}`, description: 'gate', sql_content: sql }, { 'content-type': 'application/json' })
  if (status !== 200 || !(json as { success?: boolean })?.success) throw new Error(`seed-database failed (${status}): ${JSON.stringify(json).slice(0, 200)}`)
  return dbId
}

/** Best-effort cleanup of a seeded database; failures are deliberately ignored. */
async function deleteDb(server: GymServer, dbId: string): Promise<void> {
  await fetch(`${server.mcp_server_url.replace(/\/$/, '')}/api/delete-database`, {
    method: 'DELETE',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify({ database_id: dbId }),
  }).catch(() => {})
}

/** Coerce an MCP inputSchema to an OpenAI-tool-valid top-level object schema. The
 *  router rejects top-level oneOf/anyOf/allOf/enum/not — keep the properties (nested
 *  combinators are fine) but guarantee a plain `{type:'object'}` head. */
function sanitizeSchema(s: unknown): { type: 'object'; properties: Record<string, unknown>; required?: string[] } {
  const o = s && typeof s === 'object' ? (s as Record<string, unknown>) : {}
  const banned = o.oneOf || o.anyOf || o.allOf || o.not || o.enum
  if (o.type === 'object' && !banned && o.properties && typeof o.properties === 'object') {
    return { type: 'object', properties: o.properties as Record<string, unknown>, ...(Array.isArray(o.required) ? { required: o.required as string[] } : {}) }
  }
  return { type: 'object', properties: {} }
}

/**
 * Build OpenAI-shape tool specs for the task's selected tools from the gym's MCP tools/list.
 *
 * @throws when tools/list does not return a tool array, or when tools were
 *         selected but none of them exist on the gym — running the worker with
 *         an empty tool set would be recorded as a task failure instead of an
 *         infrastructure problem.
 */
async function toolSpecs(server: GymServer, dbId: string, selected: string[]): Promise<ToolSpec[]> {
  const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp`
  const { status, json } = await postJson(url, { jsonrpc: '2.0', id: 1, method: 'tools/list', params: {} }, authHeaders(server, dbId))
  const listed = (json as { result?: { tools?: Array<{ name: string; description?: string; inputSchema?: unknown }> } } | null)?.result?.tools
  if (!Array.isArray(listed)) throw new Error(`tools/list failed (${status}): ${JSON.stringify(json).slice(0, 200)}`)
  const want = new Set(selected)
  const specs = listed
    .filter((t) => want.has(t.name))
    .map((t) => ({ type: 'function' as const, function: { name: t.name, description: (t.description ?? '').slice(0, 1000), parameters: sanitizeSchema(t.inputSchema) } }))
  if (want.size > 0 && specs.length === 0) throw new Error(`none of the ${want.size} selected tool(s) are exposed by the gym`)
  return specs
}

/** Invoke one MCP tool and return its text output (or an `error: …` string), truncated to 1500 chars. */
async function callTool(server: GymServer, dbId: string, name: string, args: Record<string, unknown>): Promise<string> {
  const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp`
  const { json } = await postJson(url, { jsonrpc: '2.0', id: 2, method: 'tools/call', params: { name, arguments: args } }, authHeaders(server, dbId))
  const result = (json as { result?: { content?: Array<{ text?: string }>; isError?: boolean }; error?: unknown }) ?? {}
  if (result.error) return `error: ${JSON.stringify(result.error).slice(0, 300)}`
  const text = result.result?.content?.map((c) => c.text ?? '').join('\n') ?? JSON.stringify(result.result ?? json)
  return text.slice(0, 1500)
}

/**
 * Strict numeric view of a verifier value. Unlike a bare `Number(v)`, null,
 * undefined, blank strings and objects are NOT numeric (they yield NaN) —
 * `Number(null)` and `Number('')` are both 0, which would let a NULL/empty SQL
 * result "equal" an expected 0.
 */
function toNumber(v: unknown): number {
  if (typeof v === 'number') return v
  if (typeof v === 'boolean' || typeof v === 'bigint') return Number(v)
  if (typeof v === 'string' && v.trim() !== '') return Number(v)
  return Number.NaN
}

/**
 * Compare a verifier's actual value with its expected value.
 * Values are compared numerically when both are numeric (see `toNumber`);
 * `equals` falls back to string equality otherwise, while the ordering
 * comparisons are false for non-numeric operands.
 *
 * @throws on a comparison_type outside the supported set.
 */
function compare(actual: unknown, expected: unknown, kind: ComparisonType): boolean {
  const fa = toNumber(actual)
  const fe = toNumber(expected)
  const numeric = !Number.isNaN(fa) && !Number.isNaN(fe)
  if (kind === 'equals') return numeric ? fa === fe : String(actual) === String(expected)
  if (kind === 'greater_than') return numeric && fa > fe
  if (kind === 'less_than') return numeric && fa < fe
  if (kind === 'contains') return String(actual).includes(String(expected))
  throw new Error(`unsupported comparison_type ${kind}`)
}

/** Only deterministic database_state verifiers with a query are scoreable (the judge rejects others). */
function scoreableVerifiers(verifiers: Verifier[]): Verifier[] {
  return verifiers.filter((v) => (v.verifier_type ?? 'database_state') === 'database_state' && v.validation_config?.query)
}

/** Run the task's SQL verifiers on the final DB state; resolved = all pass. */
async function score(server: GymServer, dbId: string, verifiers: Verifier[]): Promise<Score> {
  const dbv = scoreableVerifiers(verifiers)
  const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/sql-runner`
  let passes = 0
  for (const v of dbv) {
    const vc = v.validation_config
    if (!vc?.query) continue
    const { json } = await postJson(url, { query: vc.query, database_id: dbId }, authHeaders(server, dbId))
    // A null or non-JSON body is treated as "no rows" rather than dereferenced.
    const out = (json && typeof json === 'object' ? json : {}) as { data?: Array<Record<string, unknown>>; rows?: Array<Record<string, unknown>>; error?: unknown }
    if (out.error) continue
    const first = (out.data ?? out.rows ?? [])[0]
    // The verifier's value is the first column of the first row.
    const actual = first && typeof first === 'object' ? Object.values(first)[0] : first
    if (compare(actual, vc.expected_value, vc.comparison_type ?? 'equals')) passes += 1
  }
  return { passes, total: dbv.length, resolved: dbv.length > 0 && passes === dbv.length }
}

// ── one agentic shot: the tool-using worker acts on a (seeded) DB ──────────────

/** Build the user message for one shot; `steer` appends a correction for re-engagements. */
function shotPrompt(task: EopsTask, steer?: string): string {
  return [
    task.userPrompt,
    '',
    'Use the available tools to investigate the current state, then take the actions needed to complete the task.',
    'Inspect before you mutate. When you are confident the task is complete, give a one-line summary and stop calling tools.',
    ...(steer ? ['', `CORRECTION FROM YOUR PRIOR ATTEMPT: ${steer}`] : []),
  ].join('\n')
}

/** Run one tool-using loop of at most `maxTurns` turns against `dbId`; returns the loop's `toolCalls` count. */
async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbId: string, tools: ToolSpec[], maxTurns: number, steer?: string): Promise<number> {
  const r = await routerToolLoop(
    cfg,
    task.systemPrompt || 'You are an IT service-management operations agent.',
    shotPrompt(task, steer),
    tools,
    async (name, args) => callTool(server, dbId, name, args as Record<string, unknown>),
    { maxTurns, temperature: 0.3 },
  )
  return r.toolCalls
}

const pct = (x: number) => `${(x * 100).toFixed(1)}%`
const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`

/** Load the tasks, run both arms per task at equal compute, and print the paired comparison. */
async function main(): Promise<void> {
  const n = intEnv('N', 20, 1)
  const k = intEnv('K', 3, 1)
  const m = intEnv('M', 5, 1)
  const offset = intEnv('OFFSET', 0, 0)
  const model = process.env.WORKER_MODEL ?? 'gpt-4o-mini'
  const dbsDir = must('EOPS_GYM_DBS_DIR')
  const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model }
  const concurrency = intEnv('CONCURRENCY', 4, 1)

  console.log(`=== EOPS depth-vs-breadth gate · tool-using router worker · N=${n} K=${k} M=${m} model=${model} ===`)
  const tasks = await loadTasks(n, offset)
  console.log(`loaded ${tasks.length} itsm task(s); breadth@${k} (resample, K×M turns) vs depth@${k} (${k} steered shots on one DB, ${k * m} turns), conc=${concurrency}\n`)

  const ratio = (x: { passes: number; total: number }) => x.passes / Math.max(x.total, 1)
  const steerText = 'Re-inspect the current state with the read tools, identify what the task still requires, and complete it. Do not stop until every required change is verified in place.'

  const rows = await pool(tasks, concurrency, async (task, i) => {
    const tag = `  [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}`
    const server = task.servers[0]
    if (!server) {
      process.stderr.write(`${tag}: SKIP (no gym server configured)\n`)
      return null
    }
    // A task without scoreable verifiers can only ever score 0/0 in both arms:
    // exclude it before spending 2·K·M inference turns on it.
    if (scoreableVerifiers(task.verifiers).length === 0) {
      process.stderr.write(`${tag}: SKIP (no scoreable database_state verifier)\n`)
      return null
    }
    try {
      let acts = 0
      // breadth@K: K INDEPENDENT short loops on fresh DBs; keep the best verifier score.
      const breadthScores: Score[] = []
      for (let s = 0; s < k; s += 1) {
        const dbId = await seedDb(server, dbsDir)
        try {
          const tools = await toolSpecs(server, dbId, task.selectedTools)
          acts += await runShot(cfg, task, server, dbId, tools, m)
          breadthScores.push(await score(server, dbId, task.verifiers))
        } finally {
          await deleteDb(server, dbId)
        }
      }
      const breadthBest = breadthScores.reduce((a, b) => (ratio(b) > ratio(a) ? b : a), breadthScores[0] ?? { passes: 0, total: 1, resolved: false })

      // depth@K: K SEQUENTIAL shots over ONE persistent DB — the artifact accumulates and
      // each re-engagement is steered to finish what's left (the regime where depth won
      // before). Equal compute: K shots × M turns, same as breadth's K × M.
      const depthDb = await seedDb(server, dbsDir)
      let depth: Score = { passes: 0, total: 1, resolved: false }
      try {
        const tools = await toolSpecs(server, depthDb, task.selectedTools)
        for (let s = 0; s < k; s += 1) {
          acts += await runShot(cfg, task, server, depthDb, tools, m, s === 0 ? undefined : steerText)
        }
        depth = await score(server, depthDb, task.verifiers)
      } finally {
        await deleteDb(server, depthDb)
      }

      process.stderr.write(`${tag}: breadth=${breadthBest.passes}/${breadthBest.total} depth=${depth.passes}/${depth.total} toolcalls=${acts}\n`)
      return { breadthR: ratio(breadthBest), depthR: ratio(depth), breadthRes: breadthBest.resolved ? 1 : 0, depthRes: depth.resolved ? 1 : 0 }
    } catch (err) {
      process.stderr.write(`${tag}: SKIP (${err instanceof Error ? err.message.slice(0, 90) : String(err)})\n`)
      return null
    }
  })

  const ok = rows.filter((r): r is NonNullable<(typeof rows)[number]> => r !== null)
  const excluded = rows.length - ok.length
  // Nothing to compare: fail loud instead of bootstrapping over empty samples.
  if (ok.length === 0) throw new Error(`no task produced a score (${excluded} excluded)`)
  const breadthR = ok.map((r) => r.breadthR)
  const depthR = ok.map((r) => r.depthR)
  const breadthRes = ok.map((r) => r.breadthRes)
  const depthRes = ok.map((r) => r.depthRes)
  const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1)
  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s. (CI spans 0)')

  console.log(`\n${'='.repeat(72)}`)
  console.log(`RESULTS · EOPS itsm · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · ${model}`)
  console.log('='.repeat(72))
  console.log(`  verifier score (partial credit, the signal at this difficulty):`)
  console.log(`    breadth@${k} (resample)   ${pct(rate(breadthR))}    depth@${k} (steered)   ${pct(rate(depthR))}`)
  console.log(`  fully-resolved rate (all verifiers):`)
  console.log(`    breadth@${k}   ${pct(rate(breadthRes))}    depth@${k}   ${pct(rate(depthRes))}`)
  const liftScore = pairedLift(breadthR, depthR)
  const liftRes = pairedLift(breadthRes, depthRes)
  console.log(`\n  PAIRED LIFTS (95% bootstrap CI, B=10000):`)
  console.log(`    depth − breadth, SCORE     ${pp(liftScore.point)}  CI [${pp(liftScore.low)}, ${pp(liftScore.high)}]  (paired ${liftScore.pairs}, disc ${liftScore.discordant})  ${sig(liftScore)}`)
  console.log(`    depth − breadth, RESOLVED  ${pp(liftRes.point)}  CI [${pp(liftRes.low)}, ${pp(liftRes.high)}]  (paired ${liftRes.pairs}, disc ${liftRes.discordant})  ${sig(liftRes)}`)
  console.log(`\n  VERDICT: does steered depth beat blind breadth on this stateful agentic domain @ equal compute? ${liftScore.point > 0 ? 'yes (score)' : 'no'} (${sig(liftScore)})`)
}

main().catch((err) => {
  console.error(`eops-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`)
  process.exit(1)
})