From 94c10e78be09609a575a8ca65aeb6d55fd571d38 Mon Sep 17 00:00:00 2001 From: Mike Edwards Date: Sun, 14 Jun 2026 13:36:28 -0400 Subject: [PATCH 1/5] fix: reconcile dead-pid jobs so status/result stop reporting zombies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A worker that dies before recording a terminal state (crash, SIGKILL, OOM, broker drop, session teardown) left its job stuck status:running forever — status showed a growing elapsed on a frozen updatedAt, and result threw 'still running' / 'no job found'. - job-control.mjs: every job-read path now routes through a liveness reconcile — process.kill(pid,0) -> ESRCH means the worker is gone, so the job is persisted as failed:orphaned. Idempotent; a live pid (or our own pid) is never touched. - tracked-jobs.mjs: the worker installs exit/SIGTERM/SIGINT/SIGHUP finalizers that flush a terminal failed:interrupted record on graceful death (SIGKILL/OOM are caught by the reconcile above). Defense in depth. - render.mjs: result falls back to the job's log tail when no payload was captured. - tests/job-control.test.mjs: dead-pid->orphaned, live-pid untouched, result resolves. 86 -> 89 tests, all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/codex/scripts/lib/job-control.mjs | 69 +++++++++++-- plugins/codex/scripts/lib/render.mjs | 25 +++++ plugins/codex/scripts/lib/tracked-jobs.mjs | 48 +++++++++ tests/job-control.test.mjs | 112 +++++++++++++++++++++ 4 files changed, 248 insertions(+), 6 deletions(-) create mode 100644 tests/job-control.test.mjs diff --git a/plugins/codex/scripts/lib/job-control.mjs b/plugins/codex/scripts/lib/job-control.mjs index ad152c15..53086333 100644 --- a/plugins/codex/scripts/lib/job-control.mjs +++ b/plugins/codex/scripts/lib/job-control.mjs @@ -1,8 +1,8 @@ import fs from "node:fs"; import { getSessionRuntimeStatus } from "./codex.mjs"; -import { getConfig, listJobs, readJobFile, resolveJobFile } from "./state.mjs"; -import { SESSION_ID_ENV } from "./tracked-jobs.mjs"; +import { getConfig, listJobs, readJobFile, resolveJobFile, upsertJob, writeJobFile } from "./state.mjs"; +import { nowIso, SESSION_ID_ENV } from "./tracked-jobs.mjs"; import { resolveWorkspaceRoot } from "./workspace.mjs"; export const DEFAULT_MAX_STATUS_JOBS = 8; @@ -210,10 +210,65 @@ function matchJobReference(jobs, reference, predicate = () => true) { throw new Error(`No job found for "${reference}". Run /codex:status to list known jobs.`); } +const ACTIVE_STATUSES = new Set(["queued", "running"]); + +// Liveness probe: signal 0 is delivered to a live pid, throws ESRCH for a dead one. +// EPERM means the process exists but we may not signal it — still alive. +function isPidAlive(pid) { + if (!Number.isFinite(pid)) { + return false; + } + try { + process.kill(pid, 0); + return true; + } catch (error) { + return error?.code === "EPERM"; + } +} + +// A job whose worker pid is dead while its stored status is still active is an ORPHAN: +// the worker died (crash, SIGKILL, OOM, broker drop, parent teardown) without recording +// a terminal state, so status/result would report `running` forever. Reconcile it to a +// durable `failed:orphaned` record. Idempotent — once reconciled, status is no longer +// active, so later reads skip it. This is what makes status/result honest about liveness. +function reconcileJob(workspaceRoot, job) { + if (!job || !ACTIVE_STATUSES.has(job.status) || !Number.isFinite(job.pid)) { + return job; + } + if (job.pid === process.pid || isPidAlive(job.pid)) { + return job; + } + const patch = { + id: job.id, + status: "failed", + phase: "failed:orphaned", + pid: null, + completedAt: nowIso(), + errorMessage: + "Worker process exited without recording completion (orphaned — reconciled by a status/result check)." + }; + try { + upsertJob(workspaceRoot, patch); + const jobFile = resolveJobFile(workspaceRoot, job.id); + if (fs.existsSync(jobFile)) { + writeJobFile(workspaceRoot, job.id, { ...readJobFile(jobFile), ...patch }); + } + } catch { + // Best-effort persistence; still return the reconciled view so the caller stays honest. + } + return { ...job, ...patch }; +} + +// Reconciling read: the single source of truth for "what is actually alive". Every +// job-reading resolver routes through this so a dead worker can never masquerade as running. +function listJobsReconciled(workspaceRoot) { + return listJobs(workspaceRoot).map((job) => reconcileJob(workspaceRoot, job)); +} + export function buildStatusSnapshot(cwd, options = {}) { const workspaceRoot = resolveWorkspaceRoot(cwd); const config = getConfig(workspaceRoot); - const jobs = sortJobsNewestFirst(filterJobsForCurrentSession(listJobs(workspaceRoot), options)); + const jobs = sortJobsNewestFirst(filterJobsForCurrentSession(listJobsReconciled(workspaceRoot), options)); const maxJobs = options.maxJobs ?? DEFAULT_MAX_STATUS_JOBS; const maxProgressLines = options.maxProgressLines ?? DEFAULT_MAX_PROGRESS_LINES; @@ -241,7 +296,7 @@ export function buildStatusSnapshot(cwd, options = {}) { export function buildSingleJobSnapshot(cwd, reference, options = {}) { const workspaceRoot = resolveWorkspaceRoot(cwd); - const jobs = sortJobsNewestFirst(listJobs(workspaceRoot)); + const jobs = sortJobsNewestFirst(listJobsReconciled(workspaceRoot)); const selected = matchJobReference(jobs, reference); if (!selected) { throw new Error(`No job found for "${reference}". Run /codex:status to inspect known jobs.`); @@ -255,7 +310,9 @@ export function buildSingleJobSnapshot(cwd, reference, options = {}) { export function resolveResultJob(cwd, reference) { const workspaceRoot = resolveWorkspaceRoot(cwd); - const jobs = sortJobsNewestFirst(reference ? listJobs(workspaceRoot) : filterJobsForCurrentSession(listJobs(workspaceRoot))); + const jobs = sortJobsNewestFirst( + reference ? listJobsReconciled(workspaceRoot) : filterJobsForCurrentSession(listJobsReconciled(workspaceRoot)) + ); const selected = matchJobReference( jobs, reference, @@ -280,7 +337,7 @@ export function resolveResultJob(cwd, reference) { export function resolveCancelableJob(cwd, reference, options = {}) { const workspaceRoot = resolveWorkspaceRoot(cwd); - const jobs = sortJobsNewestFirst(listJobs(workspaceRoot)); + const jobs = sortJobsNewestFirst(listJobsReconciled(workspaceRoot)); const activeJobs = jobs.filter((job) => job.status === "queued" || job.status === "running"); if (reference) { diff --git a/plugins/codex/scripts/lib/render.mjs b/plugins/codex/scripts/lib/render.mjs index 2ec18523..c84bb9c3 100644 --- a/plugins/codex/scripts/lib/render.mjs +++ b/plugins/codex/scripts/lib/render.mjs @@ -1,3 +1,23 @@ +import fs from "node:fs"; + +// When a job has no captured result payload (e.g. it died before producing one and was +// reconciled to failed:orphaned), surface the tail of its log so the user still gets the +// real work / partial output instead of an empty "No result" message. +function readLogTail(logFile, maxLines = 40) { + if (!logFile || !fs.existsSync(logFile)) { + return null; + } + try { + const lines = fs + .readFileSync(logFile, "utf8") + .split(/\r?\n/) + .filter((line) => line.length > 0); + return lines.length ? lines.slice(-maxLines).join("\n") : null; + } catch { + return null; + } +} + function severityRank(severity) { switch (severity) { case "critical": @@ -442,6 +462,11 @@ export function renderStoredJobResult(job, storedJob) { lines.push("", "No captured result payload was stored for this job."); } + const logTail = readLogTail(job.logFile ?? storedJob?.logFile); + if (logTail) { + lines.push("", "--- last log lines (no result payload was captured) ---", logTail); + } + return `${lines.join("\n").trimEnd()}\n`; } diff --git a/plugins/codex/scripts/lib/tracked-jobs.mjs b/plugins/codex/scripts/lib/tracked-jobs.mjs index 90286901..9318ec73 100644 --- a/plugins/codex/scripts/lib/tracked-jobs.mjs +++ b/plugins/codex/scripts/lib/tracked-jobs.mjs @@ -151,6 +151,52 @@ export async function runTrackedJob(job, runner, options = {}) { writeJobFile(job.workspaceRoot, job.id, runningRecord); upsertJob(job.workspaceRoot, runningRecord); + // Crash safety: if this worker dies before recording a terminal state — SIGTERM/SIGINT/ + // SIGHUP, an uncaught error, or a plain process exit while still "running" — flush a + // terminal `failed:interrupted` record so status/result never see a permanent "running" + // ghost. (SIGKILL/OOM cannot be trapped here; those are caught by the pid-liveness + // reconcile in job-control.mjs. Defense in depth.) + let settled = false; + const finalizeInterrupted = () => { + if (settled) { + return; + } + const existing = readStoredJobOrNull(job.workspaceRoot, job.id); + if (!existing || (existing.status !== "running" && existing.status !== "queued")) { + return; + } + settled = true; + const patch = { + status: "failed", + phase: "failed:interrupted", + pid: null, + completedAt: nowIso(), + errorMessage: "Worker process terminated before completion." + }; + try { + writeJobFile(job.workspaceRoot, job.id, { ...existing, ...patch }); + upsertJob(job.workspaceRoot, { id: job.id, ...patch }); + } catch { + // best-effort + } + }; + const onSignal = (signal) => { + finalizeInterrupted(); + process.exit(signal === "SIGINT" ? 130 : 143); + }; + const trappedSignals = ["SIGTERM", "SIGINT", "SIGHUP"]; + process.on("exit", finalizeInterrupted); + for (const signal of trappedSignals) { + process.on(signal, onSignal); + } + const disarm = () => { + settled = true; + process.removeListener("exit", finalizeInterrupted); + for (const signal of trappedSignals) { + process.removeListener(signal, onSignal); + } + }; + try { const execution = await runner(); const completionStatus = execution.exitStatus === 0 ? "completed" : "failed"; @@ -200,5 +246,7 @@ export async function runTrackedJob(job, runner, options = {}) { completedAt }); throw error; + } finally { + disarm(); } } diff --git a/tests/job-control.test.mjs b/tests/job-control.test.mjs new file mode 100644 index 00000000..ecec0878 --- /dev/null +++ b/tests/job-control.test.mjs @@ -0,0 +1,112 @@ +import fs from "node:fs"; +import { spawn } from "node:child_process"; +import test from "node:test"; +import assert from "node:assert/strict"; + +import { makeTempDir } from "./helpers.mjs"; +import { resolveJobLogFile, upsertJob, writeJobFile } from "../plugins/codex/scripts/lib/state.mjs"; +import { buildStatusSnapshot, resolveResultJob } from "../plugins/codex/scripts/lib/job-control.mjs"; + +// These cover the liveness-reconcile fix: a job whose worker pid is dead but whose stored +// status is still active must not masquerade as `running` forever (the zombie bug). +function withWorkspace(fn) { + const prevData = process.env.CLAUDE_PLUGIN_DATA; + const prevSession = process.env.CODEX_COMPANION_SESSION_ID; + process.env.CLAUDE_PLUGIN_DATA = makeTempDir(); + delete process.env.CODEX_COMPANION_SESSION_ID; // avoid session-scoped filtering in tests + const workspace = makeTempDir(); + try { + return fn(workspace); + } finally { + if (prevData === undefined) { + delete process.env.CLAUDE_PLUGIN_DATA; + } else { + process.env.CLAUDE_PLUGIN_DATA = prevData; + } + if (prevSession === undefined) { + delete process.env.CODEX_COMPANION_SESSION_ID; + } else { + process.env.CODEX_COMPANION_SESSION_ID = prevSession; + } + } +} + +const DEAD_PID = 999999; // beyond the macOS/Linux pid range — guaranteed not running + +test("status reconciles a running job with a dead worker pid to failed:orphaned", () => { + withWorkspace((workspace) => { + const logFile = resolveJobLogFile(workspace, "task-dead"); + fs.writeFileSync(logFile, "[t] Running command: pytest\n[t] partial work captured\n"); + const record = { + id: "task-dead", + status: "running", + pid: DEAD_PID, + startedAt: new Date().toISOString(), + logFile, + jobClass: "task" + }; + upsertJob(workspace, record); + writeJobFile(workspace, "task-dead", record); + + const snapshot = buildStatusSnapshot(workspace); + assert.equal( + snapshot.running.some((job) => job.id === "task-dead"), + false, + "a dead-pid job must not remain in running[]" + ); + const finished = [snapshot.latestFinished, ...snapshot.recent].filter(Boolean); + const dead = finished.find((job) => job.id === "task-dead"); + assert.ok(dead, "the dead job must be reconciled into a terminal bucket"); + assert.equal(dead.status, "failed"); + assert.equal(dead.phase, "failed:orphaned"); + + // Idempotent: a second read must not error or flip it back to running. + const second = buildStatusSnapshot(workspace); + assert.equal(second.running.some((job) => job.id === "task-dead"), false); + }); +}); + +test("status leaves a running job with a live worker pid untouched", () => { + withWorkspace((workspace) => { + const child = spawn("sleep", ["10"]); + try { + upsertJob(workspace, { + id: "task-live", + status: "running", + pid: child.pid, + startedAt: new Date().toISOString(), + jobClass: "task" + }); + const snapshot = buildStatusSnapshot(workspace); + assert.equal( + snapshot.running.some((job) => job.id === "task-live" && job.status === "running"), + true, + "a live-pid job must stay running" + ); + } finally { + child.kill(); + } + }); +}); + +test("result resolves a reconciled orphan instead of throwing 'still running'", () => { + withWorkspace((workspace) => { + const logFile = resolveJobLogFile(workspace, "task-orphan"); + fs.writeFileSync(logFile, "[t] FINDING: example at foo.js:12\n"); + const record = { + id: "task-orphan", + status: "running", + pid: DEAD_PID, + startedAt: new Date().toISOString(), + logFile, + jobClass: "task", + title: "Codex Task" + }; + upsertJob(workspace, record); + writeJobFile(workspace, "task-orphan", record); + + const { job } = resolveResultJob(workspace, "task-orphan"); + assert.equal(job.status, "failed"); + assert.equal(job.phase, "failed:orphaned"); + }); +}); From ee34bb6ecca457b98a3f10b5912a51a4802b8ee4 Mon Sep 17 00:00:00 2001 From: Mike Edwards Date: Sun, 14 Jun 2026 14:12:23 -0400 Subject: [PATCH 2/5] harden(codex): wall-clock turn timeout so a hung turn can't zombie the worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A single app-server turn that never reports completion would await forever. The liveness-reconcile in job-control only reaps DEAD workers — a turn that hangs while the pid stays alive slips past it. captureTurn now races state.completion against a wall-clock deadline (CODEX_TURN_TIMEOUT_MS, default 45m, 0 disables); on expiry it best-effort turn/interrupts the stuck turn and throws, so the job finalizes failed instead of hanging. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/codex/scripts/lib/codex.mjs | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/plugins/codex/scripts/lib/codex.mjs b/plugins/codex/scripts/lib/codex.mjs index f2fe88bd..a2b9006f 100644 --- a/plugins/codex/scripts/lib/codex.mjs +++ b/plugins/codex/scripts/lib/codex.mjs @@ -550,6 +550,23 @@ function applyTurnNotification(state, message) { } } +const DEFAULT_TURN_TIMEOUT_MS = 45 * 60 * 1000; + +// Wall-clock ceiling for a single app-server turn. A turn that never reports completion would +// hang the worker forever — and because the worker pid stays alive, the liveness-reconcile in +// job-control cannot catch it (that only reaps DEAD workers). CODEX_TURN_TIMEOUT_MS overrides; 0 disables. +function resolveTurnTimeoutMs(options = {}) { + const raw = options.turnTimeoutMs ?? process.env.CODEX_TURN_TIMEOUT_MS; + if (raw === undefined || raw === null || raw === "") { + return DEFAULT_TURN_TIMEOUT_MS; + } + const value = Number(raw); + if (!Number.isFinite(value) || value < 0) { + return DEFAULT_TURN_TIMEOUT_MS; + } + return value; +} + async function captureTurn(client, threadId, startRequest, options = {}) { const state = createTurnCaptureState(threadId, options); const previousHandler = client.notificationHandler; @@ -597,6 +614,40 @@ async function captureTurn(client, threadId, startRequest, options = {}) { completeTurn(state, response.turn); } + const turnTimeoutMs = resolveTurnTimeoutMs(options); + if (turnTimeoutMs > 0) { + let timeoutTimer = null; + const timeout = new Promise((_, reject) => { + timeoutTimer = setTimeout(() => { + timeoutTimer = null; + reject( + new Error( + `Codex turn exceeded ${Math.round(turnTimeoutMs / 1000)}s without completing — interrupted and marked failed (set CODEX_TURN_TIMEOUT_MS=0 to disable).` + ) + ); + }, turnTimeoutMs); + timeoutTimer.unref?.(); + }); + try { + return await Promise.race([state.completion, timeout]); + } catch (error) { + if (state.turnId) { + // Best-effort: tell the app server to stop the stuck turn before we surface the failure. + try { + await client.request("turn/interrupt", { + threadId: state.threadId, + turnId: state.turnId + }); + } catch {} + } + throw error; + } finally { + if (timeoutTimer) { + clearTimeout(timeoutTimer); + } + } + } + return await state.completion; } finally { clearCompletionTimer(state); From b77d6dc8175ec6c5bf295610043a8882c5122449 Mon Sep 17 00:00:00 2001 From: Mike Edwards Date: Sun, 14 Jun 2026 14:12:23 -0400 Subject: [PATCH 3/5] =?UTF-8?q?feat(codex):=20design-critique=20+=20house?= =?UTF-8?q?=20review=20overlay=20=E2=80=94=20the=20two=20second-family=20j?= =?UTF-8?q?obs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex is used as a second model family for exactly two jobs; make both first-class and "like us": (a) Design critique — /codex:critique + codex-critique subagent + handleCritique run the new design-critique prompt READ-ONLY, with a hard mandate to ground every load-bearing claim in the code (file:line) AND the LIVE database (cite the query + result) — not prose-only. (b) Code review — a prompt-overlay seam (loadPromptTemplate appends prompts/overlays/.md inside ) carries our adversarial-review rubric ON TOP of the upstream prompt, so "git pull upstream" never conflicts on the scaffold. The rubric: contradict don't echo, correctness-first, file:line, demand check.sh proof, flag scope creep. Also makes the brittle exact-list command test tolerant of fork additions, and adds critique-and-overlays.test.mjs to lock in both jobs + the timeout. 96/96 green. Co-Authored-By: Claude Opus 4.8 (1M context) --- plugins/codex/agents/codex-critique.md | 33 +++++++ plugins/codex/commands/critique.md | 23 +++++ plugins/codex/prompts/design-critique.md | 49 ++++++++++ .../prompts/overlays/adversarial-review.md | 16 ++++ plugins/codex/scripts/codex-companion.mjs | 64 +++++++++++++ plugins/codex/scripts/lib/prompts.mjs | 14 ++- tests/commands.test.mjs | 11 ++- tests/critique-and-overlays.test.mjs | 96 +++++++++++++++++++ 8 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 plugins/codex/agents/codex-critique.md create mode 100644 plugins/codex/commands/critique.md create mode 100644 plugins/codex/prompts/design-critique.md create mode 100644 plugins/codex/prompts/overlays/adversarial-review.md create mode 100644 tests/critique-and-overlays.test.mjs diff --git a/plugins/codex/agents/codex-critique.md b/plugins/codex/agents/codex-critique.md new file mode 100644 index 00000000..d9bbd8ed --- /dev/null +++ b/plugins/codex/agents/codex-critique.md @@ -0,0 +1,33 @@ +--- +name: codex-critique +description: Use to hand a DESIGN (not a finished diff) to Codex for an independent, code- and data-grounded critique. Codex reads the repo AND queries the live database, then reports where the design is wrong, unjustified, or unsupported by what is actually there. +model: sonnet +tools: Bash +skills: + - codex-cli-runtime +--- + +You are a thin forwarding wrapper around the Codex companion `critique` runtime. + +Your only job is to forward the user's design-critique request to the Codex companion script. Do not do anything else. + +Selection guidance: + +- Use this subagent when Claude has produced or endorsed a DESIGN and wants a second model family to attack it before it gets built — not for reviewing a finished diff (that is `adversarial-review`) and not for running a task (that is `codex-rescue`). +- The whole point is independence: Codex reads the code and queries the database itself, so do not pre-digest the design for it. + +Forwarding rules: + +- Use exactly one `Bash` call to invoke `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" critique ...`. +- The `critique` run is READ-ONLY by design (a critique never edits). Never add `--write`. +- If the user did not explicitly choose `--background` or `--wait`, prefer background — a design critique reads the whole repo and queries the database, and usually runs long. +- Pass the user's design reference through unchanged: the design text itself as the positional argument, or `--prompt-file ` when they point at a design-doc file. +- If the user supplies `--focus ""`, forward it as `--focus ""` — it steers what the critique scrutinizes. +- Treat `--effort ` and `--model ` as runtime controls and forward them; do not include them in the design text. If the user asks for `spark`, map it to `--model gpt-5.3-codex-spark`. +- Do not reshape, summarize, or pre-analyze the design yourself. Do not inspect the repository, read files, grep, monitor progress, poll status, fetch results, or do any follow-up work of your own. The design-critique prompt and Codex do the analysis. +- Return the stdout of the `codex-companion` command exactly as-is. +- If the Bash call fails or Codex cannot be invoked, return nothing. + +Response style: + +- Do not add commentary before or after the forwarded `codex-companion` output. diff --git a/plugins/codex/commands/critique.md b/plugins/codex/commands/critique.md new file mode 100644 index 00000000..3e5ce6fc --- /dev/null +++ b/plugins/codex/commands/critique.md @@ -0,0 +1,23 @@ +--- +description: Hand a DESIGN to Codex (a second model family) for an independent critique with full read access to the code AND the live database +argument-hint: "[--background|--wait] [--model ] [--effort ] [--focus \"what to scrutinize\"] " +allowed-tools: Bash(node:*), Agent +--- + +Invoke the `codex:codex-critique` subagent via the `Agent` tool (`subagent_type: "codex:codex-critique"`), forwarding the raw user request as the prompt. +`codex:codex-critique` is a subagent, not a skill — do not call `Skill(codex:codex-critique)` (no such skill) or `Skill(codex:critique)` (that re-enters this command and hangs the session). The command runs inline so the `Agent` tool stays in scope; forked general-purpose subagents do not expose it. +The final user-visible response must be Codex's output verbatim. + +This is for critiquing a DESIGN before it is built — Codex reads the codebase and queries the live database to check the design's claims. For reviewing finished work, use `/codex:adversarial-review`; for delegating a task, use `/codex:rescue`. + +Raw user request: +$ARGUMENTS + +Execution mode: + +- If the request includes `--background`, run the `codex:codex-critique` subagent in the background. +- If the request includes `--wait`, run the `codex:codex-critique` subagent in the foreground. +- If neither flag is present, default to background — a design critique reads the whole repo and queries the database, and usually runs long. +- `--background` and `--wait` are execution flags for Claude Code. Do not forward them to `critique`, and do not treat them as part of the design text. +- `--model`, `--effort`, and `--focus` are runtime-selection flags. Preserve them for the forwarded `critique` call, but do not treat them as part of the design text. +- If the user did not supply a design, ask what design Codex should critique (a design-doc path or the design itself). diff --git a/plugins/codex/prompts/design-critique.md b/plugins/codex/prompts/design-critique.md new file mode 100644 index 00000000..97e84ebd --- /dev/null +++ b/plugins/codex/prompts/design-critique.md @@ -0,0 +1,49 @@ + +You are Codex, a SECOND model family, critiquing a DESIGN — not reviewing a finished diff. +A different model (Claude) produced or endorsed this design. Your entire value is independence: +echoing its conclusions is worthless. Find where the design is wrong, unjustified, or unsupported by +what is actually in the codebase and the database. + + + +You have full read access to the repository AND, where a database / MCP tool is available, the LIVE data. +USE BOTH. A design critique that only reasons about the prose is half a critique. +- Read the code the design touches and confirm the design's claims about what already exists. +- Query the database (e.g. `scripts/sql.sh "SELECT ..."` or the repo's MCP) to check the design's + assumptions about the DATA: counts, the n, nulls, distributions, whether a claimed relationship or + baseline is even present. A design that asserts something about the data is only as good as the query + that confirms it. Cite the query and its result in your finding. + + + +For each load-bearing claim or decision in the design: +1. State the claim in one line. +2. Verify it against the code (`file:line`) and/or the data (the query + its result). +3. If it holds, say so in one line and move on. If it does NOT hold, that is your highest-value finding — + lead with it. +Hunt specifically for: assumptions the data does not support; a simpler design the existing code already +enables; hidden coupling or a confound the design ignores; an estimand / metric that means something +different than the design assumes; scope the design under- or over-reaches; a failure mode at the +empty-state, the n=small case, or under real data skew. + + + +Default to skepticism. Do not credit good intent, partial coverage, or likely follow-up. Ground every +point in a row, a query result, or a `file:line` — never a vibe. Prefer one well-evidenced structural +objection over many shallow ones. If, after you actually checked the code AND the data, the design holds, +say so plainly and state exactly what you verified (the files you read, the queries you ran). + + + +Lead with a one-line verdict: does this design hold up, or not, and the single biggest reason. Then the +findings, most-load-bearing first, each with: the claim, what you checked (file:line and/or query+result), +and what the design should change. Be terse. Surface what is missing, not just what is wrong. + + + +{{DESIGN_INPUT}} + + + +{{USER_FOCUS}} + diff --git a/plugins/codex/prompts/overlays/adversarial-review.md b/plugins/codex/prompts/overlays/adversarial-review.md new file mode 100644 index 00000000..8ab171f2 --- /dev/null +++ b/plugins/codex/prompts/overlays/adversarial-review.md @@ -0,0 +1,16 @@ +House review rubric — apply ON TOP of the adversarial stance above. This is a second-family +code review of completed work; the author and the first model (Claude) already believe it is right. + +- You are a SECOND model family. Your value is to CONTRADICT, not echo. Hunt for what the author and + the first model would rubber-stamp. If you only agree, you added nothing — find the thing they missed. +- Order findings by what bites: correctness and regressions first, then reuse / simplification / + efficiency / altitude. Style or naming only when it causes a real bug. +- Every finding cites `file:line` and is defensible from the actual code or a tool output. Never invent + a path, a line, or runtime behavior you cannot support. +- Severity-rank, and prefer one strong, real finding over a pile of weak ones. If it is genuinely safe + after you tried to break it, say so plainly and return no findings — do not manufacture filler. +- Verification expectation: a claim of "works / passing / fixed" must be backed by command output. Flag + any change that edits code but does not run the repo's own check (e.g. `scripts/check.sh`) or whose + tests do not actually exercise the changed behavior. +- Scope discipline: changes beyond what the task asked for (bonus refactors, "while I'm here" edits) are + a finding, not a courtesy. diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 35222fd5..609d74ef 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -792,6 +792,67 @@ async function handleTask(argv) { ); } +// Design critique (the second-family, code+DB-grounded critique of a DESIGN — not a diff). +// Same task engine as handleTask, but the prompt wraps the user's design in the design-critique +// template and the run is READ-ONLY (a critique never edits). The design reference is read the +// same way a task prompt is (positional text or --prompt-file). +async function handleCritique(argv) { + const { options, positionals } = parseCommandInput(argv, { + valueOptions: ["model", "effort", "cwd", "prompt-file", "focus"], + booleanOptions: ["json", "background"], + aliasMap: { + m: "model" + } + }); + + const cwd = resolveCommandCwd(options); + const workspaceRoot = resolveCommandWorkspace(options); + const model = normalizeRequestedModel(options.model); + const effort = normalizeReasoningEffort(options.effort); + const designInput = readTaskPrompt(cwd, options, positionals); + requireTaskRequest(designInput, false); + + const prompt = interpolateTemplate(loadPromptTemplate(ROOT_DIR, "design-critique"), { + DESIGN_INPUT: designInput, + USER_FOCUS: typeof options.focus === "string" ? options.focus : "" + }); + const taskMetadata = buildTaskRunMetadata({ prompt, resumeLast: false }); + + if (options.background) { + ensureCodexAvailable(cwd); + const job = buildTaskJob(workspaceRoot, taskMetadata, false); + const request = buildTaskRequest({ + cwd, + model, + effort, + prompt, + write: false, + resumeLast: false, + jobId: job.id + }); + const { payload } = enqueueBackgroundTask(cwd, job, request); + outputCommandResult(payload, renderQueuedTaskLaunch(payload), options.json); + return; + } + + const job = buildTaskJob(workspaceRoot, taskMetadata, false); + await runForegroundCommand( + job, + (progress) => + executeTaskRun({ + cwd, + model, + effort, + prompt, + write: false, + resumeLast: false, + jobId: job.id, + onProgress: progress + }), + { json: options.json } + ); +} + async function handleTaskWorker(argv) { const { options } = parseCommandInput(argv, { valueOptions: ["cwd", "job-id"] @@ -1000,6 +1061,9 @@ async function main() { case "task": await handleTask(argv); break; + case "critique": + await handleCritique(argv); + break; case "task-worker": await handleTaskWorker(argv); break; diff --git a/plugins/codex/scripts/lib/prompts.mjs b/plugins/codex/scripts/lib/prompts.mjs index 20108150..4a9026bd 100644 --- a/plugins/codex/scripts/lib/prompts.mjs +++ b/plugins/codex/scripts/lib/prompts.mjs @@ -3,7 +3,19 @@ import path from "node:path"; export function loadPromptTemplate(rootDir, name) { const promptPath = path.join(rootDir, "prompts", `${name}.md`); - return fs.readFileSync(promptPath, "utf8"); + let template = fs.readFileSync(promptPath, "utf8"); + // House overlay (ours, not upstream's): if prompts/overlays/.md exists, append it so + // our review/critique conventions ride ON TOP of the upstream prompt without editing it — + // `git pull upstream` never conflicts on the prompt scaffold; only this loader is a small, + // stable fork change. The overlay carries no {{TOKENS}}, so interpolation leaves it intact. + const overlayPath = path.join(rootDir, "prompts", "overlays", `${name}.md`); + if (fs.existsSync(overlayPath)) { + const overlay = fs.readFileSync(overlayPath, "utf8").trim(); + if (overlay) { + template += `\n\n\n${overlay}\n\n`; + } + } + return template; } export function interpolateTemplate(template, variables) { diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 3724ffa4..653f484a 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -72,7 +72,12 @@ test("adversarial review command uses AskUserQuestion and background Bash while test("continue is not exposed as a user-facing command", () => { const commandFiles = fs.readdirSync(path.join(PLUGIN_ROOT, "commands")).sort(); - assert.deepEqual(commandFiles, [ + // The real invariant: `continue` is folded into `rescue`, never its own command. + assert.equal(commandFiles.includes("continue.md"), false); + // The upstream command set must all still be present. This fork may ADD commands + // (e.g. critique.md) on top — that must not break this guard, and the guard must + // not have to be re-edited on every upstream sync. + for (const expected of [ "adversarial-review.md", "cancel.md", "rescue.md", @@ -80,7 +85,9 @@ test("continue is not exposed as a user-facing command", () => { "review.md", "setup.md", "status.md" - ]); + ]) { + assert.equal(commandFiles.includes(expected), true, `missing command: ${expected}`); + } }); test("rescue command absorbs continue semantics", () => { diff --git a/tests/critique-and-overlays.test.mjs b/tests/critique-and-overlays.test.mjs new file mode 100644 index 00000000..e5ae295a --- /dev/null +++ b/tests/critique-and-overlays.test.mjs @@ -0,0 +1,96 @@ +import fs from "node:fs"; +import path from "node:path"; +import test from "node:test"; +import assert from "node:assert/strict"; +import { fileURLToPath } from "node:url"; + +import { loadPromptTemplate } from "../plugins/codex/scripts/lib/prompts.mjs"; + +const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), ".."); +const PLUGIN_ROOT = path.join(ROOT, "plugins", "codex"); + +function read(relativePath) { + return fs.readFileSync(path.join(PLUGIN_ROOT, relativePath), "utf8"); +} + +// Job (a): design critique. Codex is a second model family used to critique a DESIGN with full +// read access to the code AND the live database — before the design is built. +test("critique command forwards to the codex-critique subagent and stays read-only", () => { + const command = read("commands/critique.md"); + assert.match(command, /subagent_type: "codex:codex-critique"/); + assert.match(command, /do not call `Skill\(codex:codex-critique\)`/i); + assert.match(command, /allowed-tools:\s*Bash\(node:\*\),\s*Agent/); + assert.match(command, /critiquing a DESIGN before it is built/i); + assert.match(command, /default to background/i); + assert.match(command, /--focus/); + assert.match(command, /The final user-visible response must be Codex's output verbatim/i); +}); + +test("codex-critique agent is a thin forwarder to `critique` that never writes", () => { + const agent = read("agents/codex-critique.md"); + assert.match(agent, /thin forwarding wrapper/i); + assert.match(agent, /codex-companion\.mjs" critique \.\.\./); + assert.match(agent, /READ-ONLY by design/i); + assert.match(agent, /Never add `--write`/i); + // It must NOT do its own analysis — independence is the whole point. + assert.match(agent, /Do not reshape, summarize, or pre-analyze the design yourself/i); + assert.match(agent, /Return the stdout of the `codex-companion` command exactly as-is/i); + assert.match(agent, /codex-cli-runtime/); +}); + +test("the design-critique prompt mandates grounding in BOTH the code and the live database", () => { + const prompt = read("prompts/design-critique.md"); + assert.match(prompt, /SECOND model family/); + assert.match(prompt, /echoing its conclusions is worthless/i); + // The database-grounding mandate is the thing that makes this critique better than prose-only. + assert.match(prompt, /LIVE data/i); + assert.match(prompt, /Query the database/i); + assert.match(prompt, /Cite the query and its result/i); + assert.match(prompt, /file:line/); + assert.match(prompt, /\{\{DESIGN_INPUT\}\}/); + assert.match(prompt, /\{\{USER_FOCUS\}\}/); +}); + +test("the companion exposes a read-only `critique` entrypoint", () => { + const companion = read("scripts/codex-companion.mjs"); + assert.match(companion, /case "critique":/); + assert.match(companion, /async function handleCritique\(argv\)/); + assert.match(companion, /loadPromptTemplate\(ROOT_DIR, "design-critique"\)/); + // handleCritique must run write:false (a critique never edits). + assert.doesNotMatch(companion, /handleCritique[\s\S]*?write:\s*true/); +}); + +// The overlay seam: house conventions ride ON TOP of upstream prompts so `git pull upstream` +// never conflicts on the prompt scaffold. This is the durable "make it like us" mechanism. +test("the prompt overlay seam appends house conventions only when an overlay exists", () => { + // adversarial-review HAS an overlay -> house_conventions block is appended. + const reviewed = loadPromptTemplate(PLUGIN_ROOT, "adversarial-review"); + assert.match(reviewed, //); + assert.match(reviewed, /SECOND model family/); + assert.match(reviewed, /CONTRADICT, not echo/i); + + // design-critique has NO overlay -> no house_conventions wrapper is added. + const critique = loadPromptTemplate(PLUGIN_ROOT, "design-critique"); + assert.doesNotMatch(critique, //); +}); + +// Job (b): code review on completed work. The house review rubric is carried by the overlay. +test("the adversarial-review overlay carries the house code-review rubric", () => { + const overlay = read("prompts/overlays/adversarial-review.md"); + assert.match(overlay, /SECOND model family/); + assert.match(overlay, /CONTRADICT, not echo/i); + assert.match(overlay, /correctness and regressions first/i); + assert.match(overlay, /file:line/); + assert.match(overlay, /scripts\/check\.sh/); + assert.match(overlay, /Scope discipline/i); +}); + +// Robustness: a turn that never completes would hang the worker forever (pid stays alive, so the +// liveness-reconcile can't catch it). A wall-clock turn timeout interrupts and fails it instead. +test("captureTurn has a wall-clock turn timeout that interrupts the stuck turn", () => { + const codex = read("scripts/lib/codex.mjs"); + assert.match(codex, /CODEX_TURN_TIMEOUT_MS/); + assert.match(codex, /function resolveTurnTimeoutMs/); + assert.match(codex, /Promise\.race\(\[state\.completion, timeout\]\)/); + assert.match(codex, /turn\/interrupt/); +}); From 8fe24679520f46a6e8b7a7dc4eb6add349885444 Mon Sep 17 00:00:00 2001 From: Mike Edwards Date: Sun, 14 Jun 2026 14:39:52 -0400 Subject: [PATCH 4/5] retire(codex): /codex:review folds into adversarial-review; document the two-job model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex is a second model family for exactly two jobs — critique a design before building it (/codex:critique), and adversarially review completed work (/codex:adversarial-review). The plain /codex:review command was a strict subset of adversarial-review (same engine, same git targets, just no focus text and no house rubric), so it only created "which review?" confusion. - Remove the /codex:review command (the user-facing path). The companion `review` engine stays — the optional Stop review-gate uses it internally (runtime tests still cover it). - README: drop the review section + refs, add a /codex:critique section, frame the two-job model up front. adversarial-review is now "the one review command". - Skill routing + result.md follow-ups point at adversarial-review / critique. - Tests updated to match. 95/95 green. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 49 ++++++++------- plugins/codex/commands/adversarial-review.md | 4 +- plugins/codex/commands/result.md | 2 +- plugins/codex/commands/review.md | 61 ------------------- .../codex/skills/gpt-5-4-prompting/SKILL.md | 2 +- tests/commands.test.mjs | 36 +---------- 6 files changed, 31 insertions(+), 123 deletions(-) delete mode 100644 plugins/codex/commands/review.md diff --git a/README.md b/README.md index 458c39fb..1aa9ac90 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Codex plugin for Claude Code -Use Codex from inside Claude Code for code reviews or to delegate tasks to Codex. +Use Codex from inside Claude Code as a second model family — to critique a design before you build it, adversarially review completed work, or delegate a task to Codex. This plugin is for Claude Code users who want an easy way to start using Codex from the workflow they already have. @@ -9,9 +9,11 @@ they already have. ## What You Get -- `/codex:review` for a normal read-only Codex review -- `/codex:adversarial-review` for a steerable challenge review -- `/codex:rescue`, `/codex:status`, `/codex:result`, and `/codex:cancel` to delegate work and manage background jobs +Codex is a **second model family** you reach for in two situations, plus a workhorse for delegation: + +- `/codex:critique` — hand it a **design, before you build it**. Codex reads the code *and* queries the live database, then attacks the design where the data doesn't back it up. +- `/codex:adversarial-review` — an adversarial review of **completed work**. The one review command: it challenges the approach, not just the diff. +- `/codex:rescue`, `/codex:status`, `/codex:result`, and `/codex:cancel` — delegate a task to Codex and manage background jobs. ## Requirements @@ -62,41 +64,32 @@ If Codex is installed but not logged in yet, run: After install, you should see: - the slash commands listed below -- the `codex:codex-rescue` subagent in `/agents` +- the `codex:codex-rescue` and `codex:codex-critique` subagents in `/agents` One simple first run is: ```bash -/codex:review --background +/codex:adversarial-review --background /codex:status /codex:result ``` ## Usage -### `/codex:review` +### `/codex:critique` -Runs a normal Codex review on your current work. It gives you the same quality of code review as running `/review` inside Codex directly. +Hands Codex a **design — before you build it** — for an independent, second-family critique. Codex has full read access to the repository *and*, where a database or MCP tool is available, the live data. It verifies each load-bearing claim in the design against the code (`file:line`) and the data (the actual query and its result), and leads with where the design does *not* hold up. -> [!NOTE] -> Code review especially for multi-file changes might take a while. It's generally recommended to run it in the background. +Reach for this when Claude has produced or endorsed a design and you want a different model to attack it before it gets built — not to review a finished diff (that's [`/codex:adversarial-review`](#codexadversarial-review)) and not to run a task (that's [`/codex:rescue`](#codexrescue)). -Use it when you want: - -- a review of your current uncommitted changes -- a review of your branch compared to a base branch like `main` - -Use `--base ` for branch review. It also supports `--wait` and `--background`. It is not steerable and does not take custom focus text. Use [`/codex:adversarial-review`](#codexadversarial-review) when you want to challenge a specific decision or risk area. - -Examples: +Pass the design inline, or point at a design doc: ```bash -/codex:review -/codex:review --base main -/codex:review --background +/codex:critique docs/design/new-cache-layer.md +/codex:critique --focus "the n is tiny — does the data even support this split?" we plan to ... ``` -This command is read-only and will not perform any changes. When run in the background you can use [`/codex:status`](#codexstatus) to check on the progress and [`/codex:cancel`](#codexcancel) to cancel the ongoing task. +It is read-only and runs in the background by default — it reads the whole repo and queries the database, so it usually runs long. Use [`/codex:status`](#codexstatus) and [`/codex:result`](#codexresult) to follow it. ### `/codex:adversarial-review` @@ -104,8 +97,8 @@ Runs a **steerable** review that questions the chosen implementation and design. It can be used to pressure-test assumptions, tradeoffs, failure modes, and whether a different approach would have been safer or simpler. -It uses the same review target selection as `/codex:review`, including `--base ` for branch review. -It also supports `--wait` and `--background`. Unlike `/codex:review`, it can take extra focus text after the flags. +It reviews your current uncommitted changes by default, or your branch against a base with `--base `. +It also supports `--wait` and `--background`, and takes extra focus text after the flags to steer what it scrutinizes. Use it when you want: @@ -223,10 +216,16 @@ When the review gate is enabled, the plugin uses a `Stop` hook to run a targeted ## Typical Flows +### Critique A Design Before Building + +```bash +/codex:critique docs/design/new-cache-layer.md +``` + ### Review Before Shipping ```bash -/codex:review +/codex:adversarial-review ``` ### Hand A Problem To Codex diff --git a/plugins/codex/commands/adversarial-review.md b/plugins/codex/commands/adversarial-review.md index da440ab4..7ac4fdc1 100644 --- a/plugins/codex/commands/adversarial-review.md +++ b/plugins/codex/commands/adversarial-review.md @@ -39,10 +39,10 @@ Argument handling: - Do not strip `--wait` or `--background` yourself. - Do not weaken the adversarial framing or rewrite the user's focus text. - The companion script parses `--wait` and `--background`, but Claude Code's `Bash(..., run_in_background: true)` is what actually detaches the run. -- `/codex:adversarial-review` uses the same review target selection as `/codex:review`. +- `/codex:adversarial-review` is the one review command for completed work. - It supports working-tree review, branch review, and `--base `. - It does not support `--scope staged` or `--scope unstaged`. -- Unlike `/codex:review`, it can still take extra focus text after the flags. +- It takes extra focus text after the flags to steer what it scrutinizes. Foreground flow: - Run: diff --git a/plugins/codex/commands/result.md b/plugins/codex/commands/result.md index 3abc2d93..75013d4b 100644 --- a/plugins/codex/commands/result.md +++ b/plugins/codex/commands/result.md @@ -12,4 +12,4 @@ Present the full command output to the user. Do not summarize or condense it. Pr - The complete result payload, including verdict, summary, findings, details, artifacts, and next steps - File paths and line numbers exactly as reported - Any error messages or parse errors -- Follow-up commands such as `/codex:status ` and `/codex:review` +- Follow-up commands such as `/codex:status ` and `/codex:adversarial-review` diff --git a/plugins/codex/commands/review.md b/plugins/codex/commands/review.md deleted file mode 100644 index fb70a487..00000000 --- a/plugins/codex/commands/review.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -description: Run a Codex code review against local git state -argument-hint: '[--wait|--background] [--base ] [--scope auto|working-tree|branch]' -disable-model-invocation: true -allowed-tools: Read, Glob, Grep, Bash(node:*), Bash(git:*), AskUserQuestion ---- - -Run a Codex review through the shared built-in reviewer. - -Raw slash-command arguments: -`$ARGUMENTS` - -Core constraint: -- This command is review-only. -- Do not fix issues, apply patches, or suggest that you are about to make changes. -- Your only job is to run the review and return Codex's output verbatim to the user. - -Execution mode rules: -- If the raw arguments include `--wait`, do not ask. Run the review in the foreground. -- If the raw arguments include `--background`, do not ask. Run the review in a Claude background task. -- Otherwise, estimate the review size before asking: - - For working-tree review, start with `git status --short --untracked-files=all`. - - For working-tree review, also inspect both `git diff --shortstat --cached` and `git diff --shortstat`. - - For base-branch review, use `git diff --shortstat ...HEAD`. - - Treat untracked files or directories as reviewable work even when `git diff --shortstat` is empty. - - Only conclude there is nothing to review when the relevant working-tree status is empty or the explicit branch diff is empty. - - Recommend waiting only when the review is clearly tiny, roughly 1-2 files total and no sign of a broader directory-sized change. - - In every other case, including unclear size, recommend background. - - When in doubt, run the review instead of declaring that there is nothing to review. -- Then use `AskUserQuestion` exactly once with two options, putting the recommended option first and suffixing its label with `(Recommended)`: - - `Wait for results` - - `Run in background` - -Argument handling: -- Preserve the user's arguments exactly. -- Do not strip `--wait` or `--background` yourself. -- Do not add extra review instructions or rewrite the user's intent. -- The companion script parses `--wait` and `--background`, but Claude Code's `Bash(..., run_in_background: true)` is what actually detaches the run. -- `/codex:review` is native-review only. It does not support staged-only review, unstaged-only review, or extra focus text. -- If the user needs custom review instructions or more adversarial framing, they should use `/codex:adversarial-review`. - -Foreground flow: -- Run: -```bash -node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" review "$ARGUMENTS" -``` -- Return the command stdout verbatim, exactly as-is. -- Do not paraphrase, summarize, or add commentary before or after it. -- Do not fix any issues mentioned in the review output. - -Background flow: -- Launch the review with `Bash` in the background: -```typescript -Bash({ - command: `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" review "$ARGUMENTS"`, - description: "Codex review", - run_in_background: true -}) -``` -- Do not call `BashOutput` or wait for completion in this turn. -- After launching the command, tell the user: "Codex review started in the background. Check `/codex:status` for progress." diff --git a/plugins/codex/skills/gpt-5-4-prompting/SKILL.md b/plugins/codex/skills/gpt-5-4-prompting/SKILL.md index 16669d92..9d17c213 100644 --- a/plugins/codex/skills/gpt-5-4-prompting/SKILL.md +++ b/plugins/codex/skills/gpt-5-4-prompting/SKILL.md @@ -31,7 +31,7 @@ When to add blocks: - Write-capable tasks: add `action_safety` so Codex stays narrow and avoids unrelated refactors. How to choose prompt shape: -- Use built-in `review` or `adversarial-review` commands when the job is reviewing local git changes. Those prompts already carry the review contract. +- Use the `adversarial-review` command when the job is reviewing completed local git changes, and `critique` when the job is attacking a design before it is built. Those prompts already carry the review/critique contract. - Use `task` when the task is diagnosis, planning, research, or implementation and you need to control the prompt more directly. - Use `task --resume-last` for follow-up instructions on the same Codex thread. Send only the delta instruction instead of restating the whole prompt unless the direction changed materially. diff --git a/tests/commands.test.mjs b/tests/commands.test.mjs index 653f484a..f7162be3 100644 --- a/tests/commands.test.mjs +++ b/tests/commands.test.mjs @@ -11,34 +11,6 @@ function read(relativePath) { return fs.readFileSync(path.join(PLUGIN_ROOT, relativePath), "utf8"); } -test("review command uses AskUserQuestion and background Bash while staying review-only", () => { - const source = read("commands/review.md"); - assert.match(source, /AskUserQuestion/); - assert.match(source, /\bBash\(/); - assert.match(source, /Do not fix issues/i); - assert.match(source, /review-only/i); - assert.match(source, /return Codex's output verbatim to the user/i); - assert.match(source, /```bash/); - assert.match(source, /```typescript/); - assert.match(source, /review "\$ARGUMENTS"/); - assert.match(source, /\[--scope auto\|working-tree\|branch\]/); - assert.match(source, /run_in_background:\s*true/); - assert.match(source, /command:\s*`node "\$\{CLAUDE_PLUGIN_ROOT\}\/scripts\/codex-companion\.mjs" review "\$ARGUMENTS"`/); - assert.match(source, /description:\s*"Codex review"/); - assert.match(source, /Do not call `BashOutput`/); - assert.match(source, /Return the command stdout verbatim, exactly as-is/i); - assert.match(source, /git status --short --untracked-files=all/); - assert.match(source, /git diff --shortstat/); - assert.match(source, /Treat untracked files or directories as reviewable work/i); - assert.match(source, /Recommend waiting only when the review is clearly tiny, roughly 1-2 files total/i); - assert.match(source, /In every other case, including unclear size, recommend background/i); - assert.match(source, /The companion script parses `--wait` and `--background`/i); - assert.match(source, /Claude Code's `Bash\(..., run_in_background: true\)` is what actually detaches the run/i); - assert.match(source, /When in doubt, run the review/i); - assert.match(source, /\(Recommended\)/); - assert.match(source, /does not support staged-only review, unstaged-only review, or extra focus text/i); -}); - test("adversarial review command uses AskUserQuestion and background Bash while staying review-only", () => { const source = read("commands/adversarial-review.md"); assert.match(source, /AskUserQuestion/); @@ -64,10 +36,10 @@ test("adversarial review command uses AskUserQuestion and background Bash while assert.match(source, /Claude Code's `Bash\(..., run_in_background: true\)` is what actually detaches the run/i); assert.match(source, /When in doubt, run the review/i); assert.match(source, /\(Recommended\)/); - assert.match(source, /uses the same review target selection as `\/codex:review`/i); + assert.match(source, /is the one review command for completed work/i); assert.match(source, /supports working-tree review, branch review, and `--base `/i); assert.match(source, /does not support `--scope staged` or `--scope unstaged`/i); - assert.match(source, /can still take extra focus text after the flags/i); + assert.match(source, /takes extra focus text after the flags/i); }); test("continue is not exposed as a user-facing command", () => { @@ -82,7 +54,6 @@ test("continue is not exposed as a user-facing command", () => { "cancel.md", "rescue.md", "result.md", - "review.md", "setup.md", "status.md" ]) { @@ -165,9 +136,8 @@ test("rescue command absorbs continue semantics", () => { assert.match(readme, /`spark`, the plugin maps that to `gpt-5\.3-codex-spark`/i); assert.match(readme, /continue a previous Codex task/i); assert.match(readme, /### `\/codex:setup`/); - assert.match(readme, /### `\/codex:review`/); + assert.match(readme, /### `\/codex:critique`/); assert.match(readme, /### `\/codex:adversarial-review`/); - assert.match(readme, /uses the same review target selection as `\/codex:review`/i); assert.match(readme, /--base main challenge whether this was the right caching and retry design/); assert.match(readme, /### `\/codex:rescue`/); assert.match(readme, /### `\/codex:status`/); From 332f95735f9513e9f4959c1acb6acf60ee69903f Mon Sep 17 00:00:00 2001 From: Mike Edwards Date: Sun, 14 Jun 2026 16:41:47 -0400 Subject: [PATCH 5/5] fix(critique): run /codex:critique in foreground; stop detaching into an orphaned job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /codex:critique previously defaulted to --background, which detached the review into a separate Codex job and returned an instant stub — the subagent finished with nothing and no real completion was produced. - agent: always foreground; strip --background/--wait (Claude-side dispatch controls); set max Bash timeout so long critiques are not cut short - handleCritique: parse+ignore --background/--wait (matches the review commands); remove the detach path; also fixes --wait leaking into design text - README: returns the critique directly; drop stale status/result polling note - bump 1.0.4 -> 1.0.5 + CHANGELOG so the plugin cache refreshes on update Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 2 +- plugins/codex/.claude-plugin/plugin.json | 2 +- plugins/codex/CHANGELOG.md | 6 ++++++ plugins/codex/agents/codex-critique.md | 4 +++- plugins/codex/scripts/codex-companion.mjs | 25 +++++++---------------- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 1aa9ac90..1a1124e0 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Pass the design inline, or point at a design doc: /codex:critique --focus "the n is tiny — does the data even support this split?" we plan to ... ``` -It is read-only and runs in the background by default — it reads the whole repo and queries the database, so it usually runs long. Use [`/codex:status`](#codexstatus) and [`/codex:result`](#codexresult) to follow it. +It is read-only and returns the critique directly. Because it reads the whole repo and queries the database, it usually runs long, so by default it runs as a background subagent and notifies you when the critique is ready; add `--wait` to run it inline instead. ### `/codex:adversarial-review` diff --git a/plugins/codex/.claude-plugin/plugin.json b/plugins/codex/.claude-plugin/plugin.json index da262028..168b342d 100644 --- a/plugins/codex/.claude-plugin/plugin.json +++ b/plugins/codex/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "codex", - "version": "1.0.4", + "version": "1.0.5", "description": "Use Codex from Claude Code to review code or delegate tasks.", "author": { "name": "OpenAI" diff --git a/plugins/codex/CHANGELOG.md b/plugins/codex/CHANGELOG.md index d647561b..be52c3df 100644 --- a/plugins/codex/CHANGELOG.md +++ b/plugins/codex/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 1.0.5 + +- fix(critique): `/codex:critique` now runs in the foreground and returns the critique directly. It previously detached the work into a background Codex job and returned an instant stub, so the subagent "finished" with nothing and never produced a real completion. Backgrounding is now purely a subagent-dispatch concern. +- The `codex-critique` agent always runs `critique` in the foreground, strips `--background`/`--wait` (Claude-side dispatch controls), and sets a max Bash timeout so long critiques aren't cut short. +- `handleCritique` parses but ignores `--background`/`--wait` (matching the review commands), removing the detach path and fixing `--wait` leaking into the design text. + ## 1.0.0 - Initial version of the Codex plugin for Claude Code diff --git a/plugins/codex/agents/codex-critique.md b/plugins/codex/agents/codex-critique.md index d9bbd8ed..d1032d9c 100644 --- a/plugins/codex/agents/codex-critique.md +++ b/plugins/codex/agents/codex-critique.md @@ -19,8 +19,10 @@ Selection guidance: Forwarding rules: - Use exactly one `Bash` call to invoke `node "${CLAUDE_PLUGIN_ROOT}/scripts/codex-companion.mjs" critique ...`. +- A critique reads the whole repo and queries the database, so it usually runs long. Set the `Bash` call's `timeout` to `600000` (the maximum) so the run is not cut short. - The `critique` run is READ-ONLY by design (a critique never edits). Never add `--write`. -- If the user did not explicitly choose `--background` or `--wait`, prefer background — a design critique reads the whole repo and queries the database, and usually runs long. +- Always run `critique` in the foreground so the actual critique comes back as this subagent's result. Never add `--background`. Detaching would return an instant stub and orphan the real review, and this subagent is forbidden from polling or fetching it. +- `--background` and `--wait` are Claude-side dispatch controls, not `critique` flags. If the request contains either, strip it and do not forward it — backgrounding is the dispatcher's job. The subagent staying alive until Codex finishes is what makes its completion (and the returned critique) meaningful. - Pass the user's design reference through unchanged: the design text itself as the positional argument, or `--prompt-file ` when they point at a design-doc file. - If the user supplies `--focus ""`, forward it as `--focus ""` — it steers what the critique scrutinizes. - Treat `--effort ` and `--model ` as runtime controls and forward them; do not include them in the design text. If the user asks for `spark`, map it to `--model gpt-5.3-codex-spark`. diff --git a/plugins/codex/scripts/codex-companion.mjs b/plugins/codex/scripts/codex-companion.mjs index 609d74ef..0e3d5ff8 100644 --- a/plugins/codex/scripts/codex-companion.mjs +++ b/plugins/codex/scripts/codex-companion.mjs @@ -796,10 +796,16 @@ async function handleTask(argv) { // Same task engine as handleTask, but the prompt wraps the user's design in the design-critique // template and the run is READ-ONLY (a critique never edits). The design reference is read the // same way a task prompt is (positional text or --prompt-file). +// +// Unlike `task`, critique always runs in the foreground (like the review commands): it is only +// ever invoked by the `codex:codex-critique` subagent, so backgrounding it would detach the work +// into an orphaned job and return an instant stub. Backgrounding a critique is a Claude-side +// dispatch concern (run the subagent in the background). `--background`/`--wait` are parsed only +// so they cannot leak into the design text, then ignored. async function handleCritique(argv) { const { options, positionals } = parseCommandInput(argv, { valueOptions: ["model", "effort", "cwd", "prompt-file", "focus"], - booleanOptions: ["json", "background"], + booleanOptions: ["json", "background", "wait"], aliasMap: { m: "model" } @@ -818,23 +824,6 @@ async function handleCritique(argv) { }); const taskMetadata = buildTaskRunMetadata({ prompt, resumeLast: false }); - if (options.background) { - ensureCodexAvailable(cwd); - const job = buildTaskJob(workspaceRoot, taskMetadata, false); - const request = buildTaskRequest({ - cwd, - model, - effort, - prompt, - write: false, - resumeLast: false, - jobId: job.id - }); - const { payload } = enqueueBackgroundTask(cwd, job, request); - outputCommandResult(payload, renderQueuedTaskLaunch(payload), options.json); - return; - } - const job = buildTaskJob(workspaceRoot, taskMetadata, false); await runForegroundCommand( job,